Coverage Report

Created: 2024-11-21 07:03

/src/boringssl/crypto/fipsmodule/aes/aes_nohw.c.inc
Line | Count | Source
1
/* Copyright (c) 2019, Google Inc.
2
 *
3
 * Permission to use, copy, modify, and/or distribute this software for any
4
 * purpose with or without fee is hereby granted, provided that the above
5
 * copyright notice and this permission notice appear in all copies.
6
 *
7
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15
#include <openssl/aes.h>
16
17
#include <assert.h>
18
#include <string.h>
19
20
#include "../../internal.h"
21
#include "internal.h"
22
23
#if defined(OPENSSL_SSE2)
24
#include <emmintrin.h>
25
#endif
26
27
28
// This file contains a constant-time implementation of AES, bitsliced with
29
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
30
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
31
//
32
// This implementation is based on the algorithms described in the following
33
// references:
34
// - https://bearssl.org/constanttime.html#aes
35
// - https://eprint.iacr.org/2009/129.pdf
36
// - https://eprint.iacr.org/2009/191.pdf
37
38
39
// Word operations.
40
//
41
// An aes_word_t is the word used for this AES implementation. Throughout this
42
// file, bits and bytes are ordered little-endian, though "left" and "right"
43
// shifts match the operations themselves, which makes them reversed in a
44
// little-endian, left-to-right reading.
45
//
46
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
47
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
48
// bits each, each corresponding to a byte in an AES block in column-major
49
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
50
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
51
// contents of a logical byte will be described later.)
52
//
53
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
54
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
55
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
56
// value ranges from 0 to 15 independent of |aes_word_t| and
57
// |AES_NOHW_BATCH_SIZE|.
58
//
59
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
60
// uses row-major order. Matching the AES order was easier to reason about, and
61
// we do not have PSHUFB available to arbitrarily permute bytes.
62
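// A concreteness note (illustrative sketch; not in the original file): a
// "logical byte" is AES_NOHW_BATCH_SIZE bits, so shifting by one logical byte
// means a 2-bit shift with 32-bit words, a 4-bit shift with 64-bit words, and
// a full byte shift (|_mm_slli_si128| by 1) with SSE2. For example, in the
// 64-bit build:
//
//   aes_nohw_shift_left(w, 1)    // == w << 4
//   aes_nohw_shift_right(w, 15)  // == w >> 60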
63
#if defined(OPENSSL_SSE2)
64
typedef __m128i aes_word_t;
65
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
66
// MSVC, so we define a constant.
67
#define AES_NOHW_WORD_SIZE 16
68
4.18M
#define AES_NOHW_BATCH_SIZE 8
69
#define AES_NOHW_ROW0_MASK \
70
3.34M
  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
71
#define AES_NOHW_ROW1_MASK \
72
3.34M
  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
73
#define AES_NOHW_ROW2_MASK \
74
3.34M
  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
75
#define AES_NOHW_ROW3_MASK \
76
3.34M
  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
77
#define AES_NOHW_COL01_MASK \
78
8
  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
79
#define AES_NOHW_COL2_MASK \
80
8
  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
81
#define AES_NOHW_COL3_MASK \
82
8
  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)
83
84
29.9M
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
85
29.9M
  return _mm_and_si128(a, b);
86
29.9M
}
87
88
20.0M
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
89
20.0M
  return _mm_or_si128(a, b);
90
20.0M
}
91
92
57.7M
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
93
57.7M
  return _mm_xor_si128(a, b);
94
57.7M
}
95
96
2.08M
static inline aes_word_t aes_nohw_not(aes_word_t a) {
97
2.08M
  return _mm_xor_si128(
98
2.08M
      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
99
2.08M
}
100
101
// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
102
// must be constants.
103
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
104
10.3M
  _mm_slli_si128((a), (i))
105
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
106
10.1M
  _mm_srli_si128((a), (i))
107
#else  // !OPENSSL_SSE2
108
#if defined(OPENSSL_64_BIT)
109
typedef uint64_t aes_word_t;
110
#define AES_NOHW_WORD_SIZE 8
111
#define AES_NOHW_BATCH_SIZE 4
112
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
113
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
114
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
115
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
116
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
117
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
118
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
119
#else  // !OPENSSL_64_BIT
120
typedef uint32_t aes_word_t;
121
#define AES_NOHW_WORD_SIZE 4
122
#define AES_NOHW_BATCH_SIZE 2
123
#define AES_NOHW_ROW0_MASK 0x03030303
124
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
125
#define AES_NOHW_ROW2_MASK 0x30303030
126
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
127
#define AES_NOHW_COL01_MASK 0x0000ffff
128
#define AES_NOHW_COL2_MASK 0x00ff0000
129
#define AES_NOHW_COL3_MASK 0xff000000
130
#endif  // OPENSSL_64_BIT
131
132
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
133
  return a & b;
134
}
135
136
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
137
  return a | b;
138
}
139
140
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
141
  return a ^ b;
142
}
143
144
static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
145
146
static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
147
  return a << (i * AES_NOHW_BATCH_SIZE);
148
}
149
150
static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
151
  return a >> (i * AES_NOHW_BATCH_SIZE);
152
}
153
#endif  // OPENSSL_SSE2
154
155
static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
156
              "batch size does not match word size");
157
static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
158
              "AES_NOHW_WORD_SIZE is incorrect");
159
160
161
// Block representations.
162
//
163
// This implementation uses three representations for AES blocks. First, the
164
// public API represents blocks as uint8_t[16] in the usual way. Second, most
165
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
166
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
167
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
168
// bars divide logical bytes):
169
//
170
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
171
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
172
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
173
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
174
//   ...
175
//
176
// Finally, an individual block may be stored as an intermediate form in an
177
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
178
// block, so that block[0]'s ith logical byte contains least-significant
179
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
180
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
181
// "compacting" the block. Note this is no-op with 128-bit words because then
182
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
183
// words, one block would be stored in two words:
184
//
185
//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
186
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
187
//
188
// Observe that the distances between corresponding bits in bitsliced and
189
// compact bit orders match. If we line up corresponding words of each block,
190
// the bitsliced and compact representations may be converted by transposing
// bits
191
// in corresponding logical bytes. Continuing the 64-bit example:
192
//
193
//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
194
//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
195
//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
196
//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
197
//
198
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
199
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
200
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
201
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
202
//
203
// Note also that bitwise operations and (logical) byte permutations on an
204
// |aes_word_t| work equally for the bitsliced and compact words.
205
//
206
// We use the compact form in the |AES_KEY| representation to save work
207
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
208
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
209
// before or after |aes_nohw_transpose|.
210
211
200k
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
212
213
// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
214
// specified, it is in bitsliced form.
215
typedef struct {
216
  aes_word_t w[8];
217
} AES_NOHW_BATCH;
218
219
// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
220
// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH_SIZE|
221
// |AES_KEY|s so it should not be used as a long-term key representation.
222
typedef struct {
223
  // keys is an array of batches, one for each round key. Each batch stores
224
  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
225
  AES_NOHW_BATCH keys[AES_MAXNR + 1];
226
} AES_NOHW_SCHEDULE;
227
228
// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
229
// compact form.
230
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
231
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
232
3.69M
                                      size_t i) {
233
  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
234
  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
235
  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
236
  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
237
  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
238
  // will be correctly placed.)
239
3.69M
  assert(i < AES_NOHW_BATCH_SIZE);
240
3.69M
#if defined(OPENSSL_SSE2)
241
3.69M
  batch->w[i] = in[0];
242
#elif defined(OPENSSL_64_BIT)
243
  batch->w[i] = in[0];
244
  batch->w[i + 4] = in[1];
245
#else
246
  batch->w[i] = in[0];
247
  batch->w[i + 2] = in[1];
248
  batch->w[i + 4] = in[2];
249
  batch->w[i + 6] = in[3];
250
#endif
251
3.69M
}
252
253
// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
254
// compact form.
255
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
256
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
257
140k
                                      size_t i) {
258
140k
  assert(i < AES_NOHW_BATCH_SIZE);
259
140k
#if defined(OPENSSL_SSE2)
260
140k
  out[0] = batch->w[i];
261
#elif defined(OPENSSL_64_BIT)
262
  out[0] = batch->w[i];
263
  out[1] = batch->w[i + 4];
264
#else
265
  out[0] = batch->w[i];
266
  out[1] = batch->w[i + 2];
267
  out[2] = batch->w[i + 4];
268
  out[3] = batch->w[i + 6];
269
#endif
270
140k
}
271
272
#if !defined(OPENSSL_SSE2)
273
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
274
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
275
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
276
                                             aes_word_t shift) {
277
  // See
278
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
279
  aes_word_t b = (a ^ (a >> shift)) & mask;
280
  return a ^ b ^ (b << shift);
281
}
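
// A worked example of the delta swap (illustrative sketch; not in the
// original file): with mask = 0x0f and shift = 4, the low and high nibbles of
// a byte-sized value trade places.
static inline void aes_nohw_delta_swap_example(void) {
  // b = (0xa5 ^ (0xa5 >> 4)) & 0x0f = 0x0f, so the result is
  // 0xa5 ^ 0x0f ^ 0xf0 = 0x5a: the two nibbles of 0xa5 swapped.
  assert(aes_nohw_delta_swap(0xa5, 0x0f, 4) == 0x5a);
}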
282
283
// In the 32-bit and 64-bit implementations, a block spans multiple words.
284
// |aes_nohw_compact_block| must permute bits across different words. First we
285
// implement |aes_nohw_compact_word| which performs a smaller version of the
286
// transformation which stays within a single word.
287
//
288
// These transformations are generalizations of the output of
289
// http://programming.sirrida.de/calcperm.php on smaller inputs.
290
#if defined(OPENSSL_64_BIT)
291
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
292
  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
293
  // quartets of those chunks:
294
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
295
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
296
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
297
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
298
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
299
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
300
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
301
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
302
  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
303
  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
304
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
305
  return a;
306
}
307
308
static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
309
  // Reverse the steps of |aes_nohw_compact_word|.
310
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
311
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
312
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
313
  return a;
314
}
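
// The two helpers are exact inverses, which is a useful sanity check on the
// masks above (illustrative sketch; not in the original file):
static inline void aes_nohw_compact_word_example(void) {
  uint64_t x = UINT64_C(0x0123456789abcdef);
  assert(aes_nohw_uncompact_word(aes_nohw_compact_word(x)) == x);
}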
315
#else   // !OPENSSL_64_BIT
316
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
317
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
318
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
319
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
320
  // Note:  0x00cc = 0b0000_0000_1100_1100
321
  //   0x00cc << 6 = 0b0011_0011_0000_0000
322
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
323
  // Now we swap groups of four bits (still numbering by pairs):
324
  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
325
  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
326
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
327
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
328
  return a;
329
}
330
331
static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
332
  // Reverse the steps of |aes_nohw_compact_word|.
333
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
334
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
335
  return a;
336
}
337
338
static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
339
                                                uint8_t a2, uint8_t a3) {
340
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
341
         ((uint32_t)a3 << 24);
342
}
343
#endif  // OPENSSL_64_BIT
344
#endif  // !OPENSSL_SSE2
345
346
static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
347
55.6k
                                          const uint8_t in[16]) {
348
55.6k
  memcpy(out, in, 16);
349
55.6k
#if defined(OPENSSL_SSE2)
350
  // No conversions needed.
351
#elif defined(OPENSSL_64_BIT)
352
  uint64_t a0 = aes_nohw_compact_word(out[0]);
353
  uint64_t a1 = aes_nohw_compact_word(out[1]);
354
  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
355
  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
356
#else
357
  uint32_t a0 = aes_nohw_compact_word(out[0]);
358
  uint32_t a1 = aes_nohw_compact_word(out[1]);
359
  uint32_t a2 = aes_nohw_compact_word(out[2]);
360
  uint32_t a3 = aes_nohw_compact_word(out[3]);
361
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
362
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
363
  // without optimizations. This bug was introduced in
364
  // https://reviews.llvm.org/rL340261 and fixed in
365
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
366
  out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3);
367
  out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
368
  out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
369
  out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
370
#endif
371
55.6k
}
372
373
static inline void aes_nohw_uncompact_block(
374
40.7k
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
375
40.7k
#if defined(OPENSSL_SSE2)
376
40.7k
  memcpy(out, in, 16);  // No conversions needed.
377
#elif defined(OPENSSL_64_BIT)
378
  uint64_t a0 = in[0];
379
  uint64_t a1 = in[1];
380
  uint64_t b0 =
381
      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
382
  uint64_t b1 =
383
      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
384
  memcpy(out, &b0, 8);
385
  memcpy(out + 8, &b1, 8);
386
#else
387
  uint32_t a0 = in[0];
388
  uint32_t a1 = in[1];
389
  uint32_t a2 = in[2];
390
  uint32_t a3 = in[3];
391
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
392
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
393
  // without optimizations. This bug was introduced in
394
  // https://reviews.llvm.org/rL340261 and fixed in
395
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
396
  uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3);
397
  uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
398
  uint32_t b2 =
399
      aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
400
  uint32_t b3 =
401
      aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
402
  b0 = aes_nohw_uncompact_word(b0);
403
  b1 = aes_nohw_uncompact_word(b1);
404
  b2 = aes_nohw_uncompact_word(b2);
405
  b3 = aes_nohw_uncompact_word(b3);
406
  memcpy(out, &b0, 4);
407
  memcpy(out + 4, &b1, 4);
408
  memcpy(out + 8, &b2, 4);
409
  memcpy(out + 12, &b3, 4);
410
#endif
411
40.7k
}
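
// Round-trip sketch (illustrative; not in the original file): compacting a
// block and then uncompacting it returns the original 16 bytes, for any of
// the three word sizes.
static inline void aes_nohw_compact_block_example(void) {
  uint8_t in[16], out[16];
  for (int i = 0; i < 16; i++) {
    in[i] = (uint8_t)i;
  }
  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  aes_nohw_uncompact_block(out, block);
  assert(memcmp(in, out, 16) == 0);
}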
412
413
// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
414
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
415
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
416
// is repeated to the full width of |aes_word_t|.
417
#if defined(OPENSSL_SSE2)
418
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
419
// constant shift values.
420
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
421
                           /* uint32_t */ mask, /* const */ shift)        \
422
8.47M
  do {                                                                    \
423
8.47M
    __m128i swap =                                                        \
424
8.47M
        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
425
8.47M
                      _mm_set_epi32((mask), (mask), (mask), (mask)));     \
426
8.47M
    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));            \
427
8.47M
    *(b) = _mm_xor_si128(*(b), swap);                                     \
428
8.47M
                                                                          \
429
8.47M
  } while (0)
430
#else
431
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
432
                                      uint32_t mask, aes_word_t shift) {
433
#if defined(OPENSSL_64_BIT)
434
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
435
#else
436
  aes_word_t mask_w = mask;
437
#endif
438
  // This is a variation on a delta swap.
439
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
440
  *a ^= swap << shift;
441
  *b ^= swap;
442
}
443
#endif  // OPENSSL_SSE2
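
// A worked example (illustrative; not in the original file): with |a| zero,
// |b| all ones, mask = 0x55555555 and shift = 1, the odd-position bits of |a|
// (all zero) trade places with the even-position bits of |b| (all one).
// Afterwards both words equal 0xaaaa...aa: |a| gained ones in its odd
// positions, and |b| kept ones only in its odd positions.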
444
445
// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
446
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
447
// and transposes each square.
448
706k
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
449
  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
450
706k
  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
451
706k
  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
452
706k
  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
453
706k
  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
454
455
706k
#if AES_NOHW_BATCH_SIZE >= 4
456
  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
457
706k
  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
458
706k
  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
459
706k
  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
460
706k
  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
461
706k
#endif
462
463
706k
#if AES_NOHW_BATCH_SIZE >= 8
464
  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
465
706k
  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
466
706k
  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
467
706k
  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
468
706k
  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
469
706k
#endif
470
706k
}
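
// Since transposing each bit square twice is the identity,
// |aes_nohw_transpose| is its own inverse. A property-check sketch
// (illustrative; not in the original file):
static inline void aes_nohw_transpose_example(void) {
  AES_NOHW_BATCH batch, copy;
  memset(&batch, 0x5a, sizeof(batch));
  copy = batch;
  aes_nohw_transpose(&batch);
  aes_nohw_transpose(&batch);
  assert(memcmp(&copy, &batch, sizeof(copy)) == 0);
}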
471
472
// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
473
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
474
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
475
31.0k
                              size_t num_blocks) {
476
  // Don't leave unused blocks uninitialized.
477
31.0k
  memset(out, 0, sizeof(AES_NOHW_BATCH));
478
31.0k
  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
479
71.7k
  for (size_t i = 0; i < num_blocks; i++) {
480
40.7k
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
481
40.7k
    aes_nohw_compact_block(block, in + 16 * i);
482
40.7k
    aes_nohw_batch_set(out, block, i);
483
40.7k
  }
484
485
31.0k
  aes_nohw_transpose(out);
486
31.0k
}
487
488
// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
489
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
490
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
491
31.0k
                                const AES_NOHW_BATCH *batch) {
492
31.0k
  AES_NOHW_BATCH copy = *batch;
493
31.0k
  aes_nohw_transpose(&copy);
494
495
31.0k
  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
496
71.7k
  for (size_t i = 0; i < num_blocks; i++) {
497
40.7k
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
498
40.7k
    aes_nohw_batch_get(&copy, block, i);
499
40.7k
    aes_nohw_uncompact_block(out + 16 * i, block);
500
40.7k
  }
501
31.0k
}
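
// Usage sketch (illustrative; not in the original file): moving blocks into a
// batch and back out is lossless, which is the invariant the encrypt and
// decrypt paths below rely on.
static inline void aes_nohw_batch_roundtrip_example(void) {
  uint8_t in[AES_NOHW_BATCH_SIZE * 16] = {1, 2, 3};
  uint8_t out[AES_NOHW_BATCH_SIZE * 16];
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, AES_NOHW_BATCH_SIZE);
  aes_nohw_from_batch(out, AES_NOHW_BATCH_SIZE, &batch);
  assert(memcmp(in, out, sizeof(in)) == 0);
}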
502
503
504
// AES round steps.
505
506
static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
507
448k
                                   const AES_NOHW_BATCH *key) {
508
4.04M
  for (size_t i = 0; i < 8; i++) {
509
3.59M
    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
510
3.59M
  }
511
448k
}
512
513
517k
static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
514
  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
515
517k
  aes_word_t x0 = batch->w[7];
516
517k
  aes_word_t x1 = batch->w[6];
517
517k
  aes_word_t x2 = batch->w[5];
518
517k
  aes_word_t x3 = batch->w[4];
519
517k
  aes_word_t x4 = batch->w[3];
520
517k
  aes_word_t x5 = batch->w[2];
521
517k
  aes_word_t x6 = batch->w[1];
522
517k
  aes_word_t x7 = batch->w[0];
523
524
  // Figure 2, the top linear transformation.
525
517k
  aes_word_t y14 = aes_nohw_xor(x3, x5);
526
517k
  aes_word_t y13 = aes_nohw_xor(x0, x6);
527
517k
  aes_word_t y9 = aes_nohw_xor(x0, x3);
528
517k
  aes_word_t y8 = aes_nohw_xor(x0, x5);
529
517k
  aes_word_t t0 = aes_nohw_xor(x1, x2);
530
517k
  aes_word_t y1 = aes_nohw_xor(t0, x7);
531
517k
  aes_word_t y4 = aes_nohw_xor(y1, x3);
532
517k
  aes_word_t y12 = aes_nohw_xor(y13, y14);
533
517k
  aes_word_t y2 = aes_nohw_xor(y1, x0);
534
517k
  aes_word_t y5 = aes_nohw_xor(y1, x6);
535
517k
  aes_word_t y3 = aes_nohw_xor(y5, y8);
536
517k
  aes_word_t t1 = aes_nohw_xor(x4, y12);
537
517k
  aes_word_t y15 = aes_nohw_xor(t1, x5);
538
517k
  aes_word_t y20 = aes_nohw_xor(t1, x1);
539
517k
  aes_word_t y6 = aes_nohw_xor(y15, x7);
540
517k
  aes_word_t y10 = aes_nohw_xor(y15, t0);
541
517k
  aes_word_t y11 = aes_nohw_xor(y20, y9);
542
517k
  aes_word_t y7 = aes_nohw_xor(x7, y11);
543
517k
  aes_word_t y17 = aes_nohw_xor(y10, y11);
544
517k
  aes_word_t y19 = aes_nohw_xor(y10, y8);
545
517k
  aes_word_t y16 = aes_nohw_xor(t0, y11);
546
517k
  aes_word_t y21 = aes_nohw_xor(y13, y16);
547
517k
  aes_word_t y18 = aes_nohw_xor(x0, y16);
548
549
  // Figure 3, the middle non-linear section.
550
517k
  aes_word_t t2 = aes_nohw_and(y12, y15);
551
517k
  aes_word_t t3 = aes_nohw_and(y3, y6);
552
517k
  aes_word_t t4 = aes_nohw_xor(t3, t2);
553
517k
  aes_word_t t5 = aes_nohw_and(y4, x7);
554
517k
  aes_word_t t6 = aes_nohw_xor(t5, t2);
555
517k
  aes_word_t t7 = aes_nohw_and(y13, y16);
556
517k
  aes_word_t t8 = aes_nohw_and(y5, y1);
557
517k
  aes_word_t t9 = aes_nohw_xor(t8, t7);
558
517k
  aes_word_t t10 = aes_nohw_and(y2, y7);
559
517k
  aes_word_t t11 = aes_nohw_xor(t10, t7);
560
517k
  aes_word_t t12 = aes_nohw_and(y9, y11);
561
517k
  aes_word_t t13 = aes_nohw_and(y14, y17);
562
517k
  aes_word_t t14 = aes_nohw_xor(t13, t12);
563
517k
  aes_word_t t15 = aes_nohw_and(y8, y10);
564
517k
  aes_word_t t16 = aes_nohw_xor(t15, t12);
565
517k
  aes_word_t t17 = aes_nohw_xor(t4, t14);
566
517k
  aes_word_t t18 = aes_nohw_xor(t6, t16);
567
517k
  aes_word_t t19 = aes_nohw_xor(t9, t14);
568
517k
  aes_word_t t20 = aes_nohw_xor(t11, t16);
569
517k
  aes_word_t t21 = aes_nohw_xor(t17, y20);
570
517k
  aes_word_t t22 = aes_nohw_xor(t18, y19);
571
517k
  aes_word_t t23 = aes_nohw_xor(t19, y21);
572
517k
  aes_word_t t24 = aes_nohw_xor(t20, y18);
573
517k
  aes_word_t t25 = aes_nohw_xor(t21, t22);
574
517k
  aes_word_t t26 = aes_nohw_and(t21, t23);
575
517k
  aes_word_t t27 = aes_nohw_xor(t24, t26);
576
517k
  aes_word_t t28 = aes_nohw_and(t25, t27);
577
517k
  aes_word_t t29 = aes_nohw_xor(t28, t22);
578
517k
  aes_word_t t30 = aes_nohw_xor(t23, t24);
579
517k
  aes_word_t t31 = aes_nohw_xor(t22, t26);
580
517k
  aes_word_t t32 = aes_nohw_and(t31, t30);
581
517k
  aes_word_t t33 = aes_nohw_xor(t32, t24);
582
517k
  aes_word_t t34 = aes_nohw_xor(t23, t33);
583
517k
  aes_word_t t35 = aes_nohw_xor(t27, t33);
584
517k
  aes_word_t t36 = aes_nohw_and(t24, t35);
585
517k
  aes_word_t t37 = aes_nohw_xor(t36, t34);
586
517k
  aes_word_t t38 = aes_nohw_xor(t27, t36);
587
517k
  aes_word_t t39 = aes_nohw_and(t29, t38);
588
517k
  aes_word_t t40 = aes_nohw_xor(t25, t39);
589
517k
  aes_word_t t41 = aes_nohw_xor(t40, t37);
590
517k
  aes_word_t t42 = aes_nohw_xor(t29, t33);
591
517k
  aes_word_t t43 = aes_nohw_xor(t29, t40);
592
517k
  aes_word_t t44 = aes_nohw_xor(t33, t37);
593
517k
  aes_word_t t45 = aes_nohw_xor(t42, t41);
594
517k
  aes_word_t z0 = aes_nohw_and(t44, y15);
595
517k
  aes_word_t z1 = aes_nohw_and(t37, y6);
596
517k
  aes_word_t z2 = aes_nohw_and(t33, x7);
597
517k
  aes_word_t z3 = aes_nohw_and(t43, y16);
598
517k
  aes_word_t z4 = aes_nohw_and(t40, y1);
599
517k
  aes_word_t z5 = aes_nohw_and(t29, y7);
600
517k
  aes_word_t z6 = aes_nohw_and(t42, y11);
601
517k
  aes_word_t z7 = aes_nohw_and(t45, y17);
602
517k
  aes_word_t z8 = aes_nohw_and(t41, y10);
603
517k
  aes_word_t z9 = aes_nohw_and(t44, y12);
604
517k
  aes_word_t z10 = aes_nohw_and(t37, y3);
605
517k
  aes_word_t z11 = aes_nohw_and(t33, y4);
606
517k
  aes_word_t z12 = aes_nohw_and(t43, y13);
607
517k
  aes_word_t z13 = aes_nohw_and(t40, y5);
608
517k
  aes_word_t z14 = aes_nohw_and(t29, y2);
609
517k
  aes_word_t z15 = aes_nohw_and(t42, y9);
610
517k
  aes_word_t z16 = aes_nohw_and(t45, y14);
611
517k
  aes_word_t z17 = aes_nohw_and(t41, y8);
612
613
  // Figure 4, bottom linear transformation.
614
517k
  aes_word_t t46 = aes_nohw_xor(z15, z16);
615
517k
  aes_word_t t47 = aes_nohw_xor(z10, z11);
616
517k
  aes_word_t t48 = aes_nohw_xor(z5, z13);
617
517k
  aes_word_t t49 = aes_nohw_xor(z9, z10);
618
517k
  aes_word_t t50 = aes_nohw_xor(z2, z12);
619
517k
  aes_word_t t51 = aes_nohw_xor(z2, z5);
620
517k
  aes_word_t t52 = aes_nohw_xor(z7, z8);
621
517k
  aes_word_t t53 = aes_nohw_xor(z0, z3);
622
517k
  aes_word_t t54 = aes_nohw_xor(z6, z7);
623
517k
  aes_word_t t55 = aes_nohw_xor(z16, z17);
624
517k
  aes_word_t t56 = aes_nohw_xor(z12, t48);
625
517k
  aes_word_t t57 = aes_nohw_xor(t50, t53);
626
517k
  aes_word_t t58 = aes_nohw_xor(z4, t46);
627
517k
  aes_word_t t59 = aes_nohw_xor(z3, t54);
628
517k
  aes_word_t t60 = aes_nohw_xor(t46, t57);
629
517k
  aes_word_t t61 = aes_nohw_xor(z14, t57);
630
517k
  aes_word_t t62 = aes_nohw_xor(t52, t58);
631
517k
  aes_word_t t63 = aes_nohw_xor(t49, t58);
632
517k
  aes_word_t t64 = aes_nohw_xor(z4, t59);
633
517k
  aes_word_t t65 = aes_nohw_xor(t61, t62);
634
517k
  aes_word_t t66 = aes_nohw_xor(z1, t63);
635
517k
  aes_word_t s0 = aes_nohw_xor(t59, t63);
636
517k
  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
637
517k
  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
638
517k
  aes_word_t t67 = aes_nohw_xor(t64, t65);
639
517k
  aes_word_t s3 = aes_nohw_xor(t53, t66);
640
517k
  aes_word_t s4 = aes_nohw_xor(t51, t66);
641
517k
  aes_word_t s5 = aes_nohw_xor(t47, t65);
642
517k
  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
643
517k
  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
644
645
517k
  batch->w[0] = s7;
646
517k
  batch->w[1] = s6;
647
517k
  batch->w[2] = s5;
648
517k
  batch->w[3] = s4;
649
517k
  batch->w[4] = s3;
650
517k
  batch->w[5] = s2;
651
517k
  batch->w[6] = s1;
652
517k
  batch->w[7] = s0;
653
517k
}
654
655
// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES
656
// S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
657
8.40k
static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) {
658
8.40k
  aes_word_t a0 = batch->w[0];
659
8.40k
  aes_word_t a1 = batch->w[1];
660
8.40k
  aes_word_t a2 = batch->w[2];
661
8.40k
  aes_word_t a3 = batch->w[3];
662
8.40k
  aes_word_t a4 = batch->w[4];
663
8.40k
  aes_word_t a5 = batch->w[5];
664
8.40k
  aes_word_t a6 = batch->w[6];
665
8.40k
  aes_word_t a7 = batch->w[7];
666
667
  // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the circulant
668
  // [1 0 0 0 1 1 1 1].
669
8.40k
  aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7));
670
8.40k
  aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0));
671
8.40k
  aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1));
672
8.40k
  aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2));
673
8.40k
  aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3));
674
8.40k
  aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4));
675
8.40k
  aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5));
676
8.40k
  aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6));
677
678
  // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant,
679
  // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.)
680
8.40k
  batch->w[0] = aes_nohw_not(b0);
681
8.40k
  batch->w[1] = b1;
682
8.40k
  batch->w[2] = aes_nohw_not(b2);
683
8.40k
  batch->w[3] = b3;
684
8.40k
  batch->w[4] = b4;
685
8.40k
  batch->w[5] = b5;
686
8.40k
  batch->w[6] = b6;
687
8.40k
  batch->w[7] = b7;
688
8.40k
}
689
690
4.20k
static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) {
691
  // We implement the inverse S-box using the forwards implementation with the
692
  // technique described in https://www.bearssl.org/constanttime.html#aes.
693
  //
694
  // The forwards S-box inverts its input and applies an affine transformation:
695
  // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then:
696
  //
697
  //   InvS(x) = Inv(InvA(x))
698
  //           = InvA(S(InvA(x)))
699
4.20k
  aes_nohw_sub_bytes_inv_affine(batch);
700
4.20k
  aes_nohw_sub_bytes(batch);
701
4.20k
  aes_nohw_sub_bytes_inv_affine(batch);
702
4.20k
}
703
704
// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
705
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
706
// constant shift counts in the SSE2 implementation.
707
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
708
10.0M
  (aes_nohw_or(aes_nohw_shift_right((v), (n)*4),                      \
709
10.0M
               aes_nohw_shift_left((v), 16 - (n)*4)))
710
711
413k
static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
712
3.72M
  for (size_t i = 0; i < 8; i++) {
713
3.30M
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
714
3.30M
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
715
3.30M
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
716
3.30M
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
717
3.30M
    row1 = aes_nohw_rotate_cols_right(row1, 1);
718
3.30M
    row2 = aes_nohw_rotate_cols_right(row2, 2);
719
3.30M
    row3 = aes_nohw_rotate_cols_right(row3, 3);
720
3.30M
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
721
3.30M
  }
722
413k
}
723
724
4.20k
static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) {
725
37.8k
  for (size_t i = 0; i < 8; i++) {
726
33.6k
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
727
33.6k
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
728
33.6k
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
729
33.6k
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
730
33.6k
    row1 = aes_nohw_rotate_cols_right(row1, 3);
731
33.6k
    row2 = aes_nohw_rotate_cols_right(row2, 2);
732
33.6k
    row3 = aes_nohw_rotate_cols_right(row3, 1);
733
33.6k
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
734
33.6k
  }
735
4.20k
}
736
737
// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
738
// down by one.
739
3.15M
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
740
3.15M
#if defined(OPENSSL_SSE2)
741
3.15M
  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
742
#elif defined(OPENSSL_64_BIT)
743
  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
744
         ((v << 12) & UINT64_C(0xf000f000f000f000));
745
#else
746
  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
747
#endif
748
3.15M
}
749
750
// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
751
// by two.
752
3.12M
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
753
3.12M
#if defined(OPENSSL_SSE2)
754
3.12M
  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
755
#elif defined(OPENSSL_64_BIT)
756
  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
757
         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
758
#else
759
  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
760
#endif
761
3.12M
}
762
763
386k
static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
764
  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
765
386k
  aes_word_t a0 = batch->w[0];
766
386k
  aes_word_t a1 = batch->w[1];
767
386k
  aes_word_t a2 = batch->w[2];
768
386k
  aes_word_t a3 = batch->w[3];
769
386k
  aes_word_t a4 = batch->w[4];
770
386k
  aes_word_t a5 = batch->w[5];
771
386k
  aes_word_t a6 = batch->w[6];
772
386k
  aes_word_t a7 = batch->w[7];
773
774
386k
  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
775
386k
  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
776
386k
  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
777
386k
  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
778
386k
  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
779
386k
  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
780
386k
  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
781
386k
  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
782
386k
  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
783
386k
  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
784
386k
  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
785
386k
  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
786
386k
  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
787
386k
  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
788
386k
  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
789
386k
  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
790
791
386k
  batch->w[0] =
792
386k
      aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
793
386k
  batch->w[1] =
794
386k
      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
795
386k
                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
796
386k
  batch->w[2] =
797
386k
      aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
798
386k
  batch->w[3] =
799
386k
      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
800
386k
                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
801
386k
  batch->w[4] =
802
386k
      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
803
386k
                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
804
386k
  batch->w[5] =
805
386k
      aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
806
386k
  batch->w[6] =
807
386k
      aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
808
386k
  batch->w[7] =
809
386k
      aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
810
386k
}
811
812
3.78k
static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) {
813
3.78k
  aes_word_t a0 = batch->w[0];
814
3.78k
  aes_word_t a1 = batch->w[1];
815
3.78k
  aes_word_t a2 = batch->w[2];
816
3.78k
  aes_word_t a3 = batch->w[3];
817
3.78k
  aes_word_t a4 = batch->w[4];
818
3.78k
  aes_word_t a5 = batch->w[5];
819
3.78k
  aes_word_t a6 = batch->w[6];
820
3.78k
  aes_word_t a7 = batch->w[7];
821
822
  // bsaes-x86_64.pl describes the following decomposition of the inverse
823
  // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler
824
  // multiplication.
825
  //
826
  // | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
827
  // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
828
  // | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
829
  // | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
830
  //
831
  // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described
832
  // by the following bit equations:
833
  //
834
  //   b0 = a6
835
  //   b1 = a6 ^ a7
836
  //   b2 = a0 ^ a7
837
  //   b3 = a1 ^ a6
838
  //   b4 = a2 ^ a6 ^ a7
839
  //   b5 = a3 ^ a7
840
  //   b6 = a4
841
  //   b7 = a5
842
  //
843
  // Each coefficient is given by:
844
  //
845
  //   b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij
846
  //
847
  // We combine the two equations below. Note a_i(j+2) is a row rotation.
848
3.78k
  aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0));
849
3.78k
  aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1));
850
3.78k
  aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2));
851
3.78k
  aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3));
852
3.78k
  aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4));
853
3.78k
  aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5));
854
3.78k
  aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6));
855
3.78k
  aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7));
856
857
3.78k
  batch->w[0] = aes_nohw_xor(a0, a6_r6);
858
3.78k
  batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7));
859
3.78k
  batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7));
860
3.78k
  batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6));
861
3.78k
  batch->w[4] =
862
3.78k
      aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7));
863
3.78k
  batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7));
864
3.78k
  batch->w[6] = aes_nohw_xor(a6, a4_r4);
865
3.78k
  batch->w[7] = aes_nohw_xor(a7, a5_r5);
866
867
  // Apply the [02 03 01 01] matrix, which is just MixColumns.
868
3.78k
  aes_nohw_mix_columns(batch);
869
3.78k
}
870
871
static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
872
30.6k
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
873
30.6k
  aes_nohw_add_round_key(batch, &key->keys[0]);
874
413k
  for (size_t i = 1; i < num_rounds; i++) {
875
383k
    aes_nohw_sub_bytes(batch);
876
383k
    aes_nohw_shift_rows(batch);
877
383k
    aes_nohw_mix_columns(batch);
878
383k
    aes_nohw_add_round_key(batch, &key->keys[i]);
879
383k
  }
880
30.6k
  aes_nohw_sub_bytes(batch);
881
30.6k
  aes_nohw_shift_rows(batch);
882
30.6k
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
883
30.6k
}
884
885
static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
886
412
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
887
412
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
888
412
  aes_nohw_inv_shift_rows(batch);
889
412
  aes_nohw_inv_sub_bytes(batch);
890
4.20k
  for (size_t i = num_rounds - 1; i > 0; i--) {
891
3.78k
    aes_nohw_add_round_key(batch, &key->keys[i]);
892
3.78k
    aes_nohw_inv_mix_columns(batch);
893
3.78k
    aes_nohw_inv_shift_rows(batch);
894
3.78k
    aes_nohw_inv_sub_bytes(batch);
895
3.78k
  }
896
412
  aes_nohw_add_round_key(batch, &key->keys[0]);
897
412
}
898
899
900
// Key schedule.
901
902
static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
903
30.6k
                                       const AES_KEY *key) {
904
474k
  for (size_t i = 0; i <= key->rounds; i++) {
905
    // Copy the round key into each block in the batch.
906
3.99M
    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
907
3.55M
      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
908
3.55M
      memcpy(tmp, key->rd_key + 4 * i, 16);
909
3.55M
      aes_nohw_batch_set(&out->keys[i], tmp, j);
910
3.55M
    }
911
444k
    aes_nohw_transpose(&out->keys[i]);
912
444k
  }
913
30.6k
}
914
915
static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
916
                                          0x20, 0x40, 0x80, 0x1b, 0x36};
917
918
// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
919
// |rcon|, stored in a |aes_word_t|.
920
57.7k
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
921
57.7k
  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
922
57.7k
#if defined(OPENSSL_SSE2)
923
57.7k
  return _mm_set_epi32(0, 0, 0, rcon);
924
#else
925
  return ((aes_word_t)rcon);
926
#endif
927
57.7k
}
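
// A worked example (illustrative; not in the original file): with 32-bit
// words, AES_NOHW_BATCH_SIZE is 2, so rcon 0x36 = 0b00110110 splits into the
// 2-bit slices 0b10, 0b01, 0b11, 0b00, least-significant group first:
//
//   aes_nohw_rcon_slice(0x36, 0) == 0x2
//   aes_nohw_rcon_slice(0x36, 1) == 0x1
//   aes_nohw_rcon_slice(0x36, 2) == 0x3
//   aes_nohw_rcon_slice(0x36, 3) == 0x0
//
// With 128-bit words there is a single slice containing the whole byte.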
928
929
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
930
100k
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
931
100k
  AES_NOHW_BATCH batch;
932
100k
  memset(&batch, 0, sizeof(batch));
933
100k
  aes_nohw_batch_set(&batch, in, 0);
934
100k
  aes_nohw_transpose(&batch);
935
100k
  aes_nohw_sub_bytes(&batch);
936
100k
  aes_nohw_transpose(&batch);
937
100k
  aes_nohw_batch_get(&batch, out, 0);
938
100k
}
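
// Spot-check sketch against known S-box values from FIPS 197 (S(0x00) = 0x63
// and S(0x53) = 0xed); illustrative only, not in the original file:
static inline void aes_nohw_sub_block_example(void) {
  uint8_t in[16] = {0x00, 0x53};  // Remaining bytes are zero.
  uint8_t out[16];
  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  aes_nohw_sub_block(block, block);
  aes_nohw_uncompact_block(out, block);
  assert(out[0] == 0x63 && out[1] == 0xed);
}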
939
940
846
static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
941
846
  key->rounds = 10;
942
943
846
  aes_word_t block[AES_NOHW_BLOCK_WORDS];
944
846
  aes_nohw_compact_block(block, in);
945
846
  memcpy(key->rd_key, block, 16);
946
947
9.30k
  for (size_t i = 1; i <= 10; i++) {
948
8.46k
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
949
8.46k
    aes_nohw_sub_block(sub, block);
950
8.46k
    uint8_t rcon = aes_nohw_rcon[i - 1];
951
16.9k
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
952
      // Incorporate |rcon| and the transformed word into the first word.
953
8.46k
      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
954
8.46k
      block[j] = aes_nohw_xor(
955
8.46k
          block[j],
956
8.46k
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
957
      // Propagate to the remaining words. Note this is reordered from the usual
958
      // formulation to avoid needing masks.
959
8.46k
      aes_word_t v = block[j];
960
8.46k
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
961
8.46k
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
962
8.46k
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
963
8.46k
    }
964
8.46k
    memcpy(key->rd_key + 4 * i, block, 16);
965
8.46k
  }
966
846
}
967
968
2
static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) {
969
2
  key->rounds = 12;
970
971
2
  aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS];
972
2
  aes_word_t *block1 = storage1, *block2 = storage2;
973
974
  // AES-192's key schedule is complex because each key schedule iteration
975
  // produces six words, but we compute on blocks and each block is four words.
976
  // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time.
977
  // We loop below every three blocks or two key schedule iterations.
978
  //
979
  // On entry to the loop, |block1| and the first half of |block2| contain the
980
  // previous key schedule iteration. |block1| has been written to |key|, but
981
  // |block2| has not as it is incomplete.
982
2
  aes_nohw_compact_block(block1, in);
983
2
  memcpy(key->rd_key, block1, 16);
984
985
2
  uint8_t half_block[16] = {0};
986
2
  memcpy(half_block, in + 16, 8);
987
2
  aes_nohw_compact_block(block2, half_block);
988
989
10
  for (size_t i = 0; i < 4; i++) {
990
8
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
991
8
    aes_nohw_sub_block(sub, block2);
992
8
    uint8_t rcon = aes_nohw_rcon[2 * i];
993
16
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
994
      // Compute the first two words of the next key schedule iteration, which
995
      // go in the second half of |block2|. The first two words of the previous
996
      // iteration are in the first half of |block1|. Apply |rcon| here too
997
      // because the shifts match.
998
8
      block2[j] = aes_nohw_or(
999
8
          block2[j],
1000
8
          aes_nohw_shift_left(
1001
8
              aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8));
1002
      // Incorporate the transformed word and propagate. Note the last word of
1003
      // the previous iteration corresponds to the second word of |copy|. This
1004
      // is incorporated into the first word of the next iteration, or the third
1005
      // word of |block2|.
1006
8
      block2[j] = aes_nohw_xor(
1007
8
          block2[j], aes_nohw_and(aes_nohw_shift_left(
1008
8
                                      aes_nohw_rotate_rows_down(sub[j]), 4),
1009
8
                                  AES_NOHW_COL2_MASK));
1010
8
      block2[j] = aes_nohw_xor(
1011
8
          block2[j],
1012
8
          aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK));
1013
1014
      // Compute the remaining four words, which fill |block1|. Begin by moving
1015
      // the corresponding words of the previous iteration: the second half of
1016
      // |block1| and the first half of |block2|.
1017
8
      block1[j] = aes_nohw_shift_right(block1[j], 8);
1018
8
      block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8));
1019
      // Incorporate the second word, computed previously in |block2|, and
1020
      // propagate.
1021
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
1022
8
      aes_word_t v = block1[j];
1023
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
1024
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
1025
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
1026
8
    }
1027
1028
    // This completes two round keys. Note half of |block2| was computed in the
1029
    // previous loop iteration but was not yet output.
1030
8
    memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16);
1031
8
    memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16);
1032
1033
8
    aes_nohw_sub_block(sub, block1);
1034
8
    rcon = aes_nohw_rcon[2 * i + 1];
1035
16
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1036
      // Compute the first four words of the next key schedule iteration in
1037
      // |block2|. Begin by moving the corresponding words of the previous
1038
      // iteration: the second half of |block2| and the first half of |block1|.
1039
8
      block2[j] = aes_nohw_shift_right(block2[j], 8);
1040
8
      block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8));
1041
      // Incorporate rcon and the transformed word. Note the last word of the
1042
      // previous iteration corresponds to the last word of |copy|.
1043
8
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j));
1044
8
      block2[j] = aes_nohw_xor(
1045
8
          block2[j],
1046
8
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
1047
      // Propagate to the remaining words.
1048
8
      aes_word_t v = block2[j];
1049
8
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
1050
8
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
1051
8
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
1052
1053
      // Compute the last two words, which go in the first half of |block1|. The
1054
      // last two words of the previous iteration are in the second half of
1055
      // |block1|.
1056
8
      block1[j] = aes_nohw_shift_right(block1[j], 8);
1057
      // Propagate blocks and mask off the excess.
1058
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
1059
8
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4));
1060
8
      block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK);
1061
8
    }
1062
1063
    // |block2| has a complete round key. |block1| will be completed in the next
1064
    // iteration.
1065
8
    memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16);
1066
1067
    // Swap blocks to restore the invariant.
1068
8
    aes_word_t *tmp = block1;
1069
8
    block1 = block2;
1070
8
    block2 = tmp;
1071
8
  }
1072
2
}
1073
1074
7.04k
static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
1075
7.04k
  key->rounds = 14;
1076
1077
  // Each key schedule iteration produces two round keys.
1078
7.04k
  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
1079
7.04k
  aes_nohw_compact_block(block1, in);
1080
7.04k
  memcpy(key->rd_key, block1, 16);
1081
1082
7.04k
  aes_nohw_compact_block(block2, in + 16);
1083
7.04k
  memcpy(key->rd_key + 4, block2, 16);
1084
1085
49.3k
  for (size_t i = 2; i <= 14; i += 2) {
1086
49.3k
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
1087
49.3k
    aes_nohw_sub_block(sub, block2);
1088
49.3k
    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
1089
98.6k
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1090
      // Incorporate |rcon| and the transformed word into the first word.
1091
49.3k
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
1092
49.3k
      block1[j] = aes_nohw_xor(
1093
49.3k
          block1[j],
1094
49.3k
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
1095
      // Propagate to the remaining words.
1096
49.3k
      aes_word_t v = block1[j];
1097
49.3k
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
1098
49.3k
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
1099
49.3k
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
1100
49.3k
    }
1101
49.3k
    memcpy(key->rd_key + 4 * i, block1, 16);
1102
1103
49.3k
    if (i == 14) {
1104
7.04k
      break;
1105
7.04k
    }
1106
1107
42.2k
    aes_nohw_sub_block(sub, block1);
1108
84.5k
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
1109
      // Incorporate the transformed word into the first word.
1110
42.2k
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
1111
      // Propagate to the remaining words.
1112
42.2k
      aes_word_t v = block2[j];
1113
42.2k
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
1114
42.2k
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
1115
42.2k
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
1116
42.2k
    }
1117
42.2k
    memcpy(key->rd_key + 4 * (i + 1), block2, 16);
1118
42.2k
  }
1119
7.04k
}
1120
1121
1122
// External API.
1123
1124
int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
1125
7.89k
                             AES_KEY *aeskey) {
1126
7.89k
  switch (bits) {
1127
846
    case 128:
1128
846
      aes_nohw_setup_key_128(aeskey, key);
1129
846
      return 0;
1130
2
    case 192:
1131
2
      aes_nohw_setup_key_192(aeskey, key);
1132
2
      return 0;
1133
7.04k
    case 256:
1134
7.04k
      aes_nohw_setup_key_256(aeskey, key);
1135
7.04k
      return 0;
1136
7.89k
  }
1137
0
  return 1;
1138
7.89k
}
1139
1140
int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
1141
120
                             AES_KEY *aeskey) {
1142
120
  return aes_nohw_set_encrypt_key(key, bits, aeskey);
1143
120
}
1144
1145
26.5k
void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
1146
26.5k
  AES_NOHW_SCHEDULE sched;
1147
26.5k
  aes_nohw_expand_round_keys(&sched, key);
1148
26.5k
  AES_NOHW_BATCH batch;
1149
26.5k
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
1150
26.5k
  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1151
26.5k
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1152
26.5k
}
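
// Usage sketch with the AES-128 example vector from FIPS 197 Appendix B
// (illustrative only; not in the original file):
static inline void aes_nohw_encrypt_example(void) {
  static const uint8_t kKey[16] = {0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae,
                                   0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88,
                                   0x09, 0xcf, 0x4f, 0x3c};
  static const uint8_t kPlaintext[16] = {0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a,
                                         0x30, 0x8d, 0x31, 0x31, 0x98, 0xa2,
                                         0xe0, 0x37, 0x07, 0x34};
  AES_KEY key;
  aes_nohw_set_encrypt_key(kKey, 128, &key);
  uint8_t out[16];
  aes_nohw_encrypt(kPlaintext, out, &key);
  // Expected ciphertext: 3925841d02dc09fbdc118597196a0b32.
}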
1153
1154
392
void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
1155
392
  AES_NOHW_SCHEDULE sched;
1156
392
  aes_nohw_expand_round_keys(&sched, key);
1157
392
  AES_NOHW_BATCH batch;
1158
392
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
1159
392
  aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
1160
392
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1161
392
}
1162
1163
static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
1164
13.7k
                                      const uint8_t b[16]) {
1165
27.4k
  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
1166
13.7k
    aes_word_t x, y;
1167
13.7k
    memcpy(&x, a + i, sizeof(aes_word_t));
1168
13.7k
    memcpy(&y, b + i, sizeof(aes_word_t));
1169
13.7k
    x = aes_nohw_xor(x, y);
1170
13.7k
    memcpy(out + i, &x, sizeof(aes_word_t));
1171
13.7k
  }
1172
13.7k
}
1173
1174
void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
1175
                                   size_t blocks, const AES_KEY *key,
1176
3.63k
                                   const uint8_t ivec[16]) {
1177
3.63k
  if (blocks == 0) {
1178
0
    return;
1179
0
  }
1180
1181
3.63k
  AES_NOHW_SCHEDULE sched;
1182
3.63k
  aes_nohw_expand_round_keys(&sched, key);
1183
1184
  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
1185
3.63k
  alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16];
1186
3.63k
  alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16];
1187
32.6k
  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
1188
29.0k
    memcpy(ivs + 16 * i, ivec, 16);
1189
29.0k
  }
1190
1191
3.63k
  uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
1192
3.98k
  for (;;) {
1193
    // Update counters.
1194
35.8k
    for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
1195
31.9k
      CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
1196
31.9k
    }
1197
1198
3.98k
    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
1199
3.98k
    AES_NOHW_BATCH batch;
1200
3.98k
    aes_nohw_to_batch(&batch, ivs, todo);
1201
3.98k
    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1202
3.98k
    aes_nohw_from_batch(enc_ivs, todo, &batch);
1203
1204
17.6k
    for (size_t i = 0; i < todo; i++) {
1205
13.6k
      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i);
1206
13.6k
    }
1207
1208
3.98k
    blocks -= todo;
1209
3.98k
    if (blocks == 0) {
1210
3.63k
      break;
1211
3.63k
    }
1212
1213
358
    in += 16 * AES_NOHW_BATCH_SIZE;
1214
358
    out += 16 * AES_NOHW_BATCH_SIZE;
1215
358
    ctr += AES_NOHW_BATCH_SIZE;
1216
358
  }
1217
3.63k
}
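
// Usage sketch (illustrative; not in the original file): |ivec| carries a
// 12-byte nonce followed by a 32-bit big-endian counter in its last four
// bytes. Note |ivec| is not updated, so a caller streaming more than one
// call's worth of data must advance the counter itself.
static inline void aes_nohw_ctr32_example(const AES_KEY *key) {
  uint8_t ivec[16] = {0};  // nonce || counter, both zero here.
  uint8_t in[32] = {0}, out[32];
  aes_nohw_ctr32_encrypt_blocks(in, out, /*blocks=*/2, key, ivec);
}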
1218
1219
void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
1220
79
                          const AES_KEY *key, uint8_t *ivec, const int enc) {
1221
79
  assert(len % 16 == 0);
1222
79
  size_t blocks = len / 16;
1223
79
  if (blocks == 0) {
1224
0
    return;
1225
0
  }
1226
1227
79
  AES_NOHW_SCHEDULE sched;
1228
79
  aes_nohw_expand_round_keys(&sched, key);
1229
79
  alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16];
1230
79
  memcpy(iv, ivec, 16);
1231
1232
79
  if (enc) {
1233
    // CBC encryption is not parallelizable.
1234
119
    while (blocks > 0) {
1235
60
      aes_nohw_xor_block(iv, iv, in);
1236
1237
60
      AES_NOHW_BATCH batch;
1238
60
      aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1);
1239
60
      aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
1240
60
      aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
1241
1242
60
      memcpy(iv, out, 16);
1243
1244
60
      in += 16;
1245
60
      out += 16;
1246
60
      blocks--;
1247
60
    }
1248
59
    memcpy(ivec, iv, 16);
1249
59
    return;
1250
59
  }
1251
1252
20
  for (;;) {
1253
20
    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
1254
    // Make a copy of the input so we can decrypt in-place.
1255
20
    alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16];
1256
20
    memcpy(copy, in, todo * 16);
1257
1258
20
    AES_NOHW_BATCH batch;
1259
20
    aes_nohw_to_batch(&batch, in, todo);
1260
20
    aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
1261
20
    aes_nohw_from_batch(out, todo, &batch);
1262
1263
20
    aes_nohw_xor_block(out, out, iv);
1264
60
    for (size_t i = 1; i < todo; i++) {
1265
40
      aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1));
1266
40
    }
1267
1268
    // Save the last block as the IV.
1269
20
    memcpy(iv, copy + 16 * (todo - 1), 16);
1270
1271
20
    blocks -= todo;
1272
20
    if (blocks == 0) {
1273
20
      break;
1274
20
    }
1275
1276
0
    in += 16 * AES_NOHW_BATCH_SIZE;
1277
0
    out += 16 * AES_NOHW_BATCH_SIZE;
1278
0
  }
1279
1280
20
  memcpy(ivec, iv, 16);
1281
20
}
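
// Usage sketch (illustrative; not in the original file): |len| must be a
// multiple of 16, and |ivec| is updated on return so consecutive calls chain
// correctly. In-place operation (|in| == |out|) works in both directions; the
// decrypt path copies its input first for exactly this reason.
static inline void aes_nohw_cbc_example(const AES_KEY *key) {
  uint8_t iv[16] = {0};
  uint8_t buf[32] = {0};
  aes_nohw_cbc_encrypt(buf, buf, sizeof(buf), key, iv, /*enc=*/1);
}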