Coverage Report

Created: 2024-11-21 06:47

/src/boringssl/crypto/fipsmodule/aes/aes_nohw.c.inc
/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <openssl/aes.h>

#include <assert.h>
#include <string.h>

#include "../../internal.h"
#include "internal.h"

#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf


// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.
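//
// For example, in the 32-bit build, |AES_NOHW_BATCH_SIZE| is 2, so a logical
// byte is two bits wide and a uint32_t holds 16 of them. A shift of one
// logical byte is then a shift of two bits: aes_nohw_shift_left(a, 1)
// computes a << 2, and the maximum shift of 15 logical bytes is a << 30.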

#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return _mm_and_si128(a, b);
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return _mm_or_si128(a, b);
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return _mm_xor_si128(a, b);
}

static inline aes_word_t aes_nohw_not(aes_word_t a) {
  return _mm_xor_si128(
      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}

// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
  _mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
  _mm_srli_si128((a), (i))
#else  // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else  // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif  // OPENSSL_64_BIT

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return a & b;
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return a | b;
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return a ^ b;
}

static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }

static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
  return a << (i * AES_NOHW_BATCH_SIZE);
}

static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
  return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif  // OPENSSL_SSE2

static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
              "batch size does not match word size");
static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
              "AES_NOHW_WORD_SIZE is incorrect");


// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//   ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is a no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing
// bits in corresponding logical bytes. Continuing the 64-bit example:
//
//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.

#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))

// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
  aes_word_t w[8];
} AES_NOHW_BATCH;

// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as
// |AES_NOHW_BATCH_SIZE| |AES_KEY|s, so it should not be used as a long-term
// key representation.
typedef struct {
  // keys is an array of batches, one for each round key. Each batch stores
  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
  AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;

// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
  // will be correctly placed.)
  assert(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
  batch->w[i] = in[0];
  batch->w[i + 4] = in[1];
#else
  batch->w[i] = in[0];
  batch->w[i + 2] = in[1];
  batch->w[i + 4] = in[2];
  batch->w[i + 6] = in[3];
#endif
}

// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  assert(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
  out[0] = batch->w[i];
  out[1] = batch->w[i + 4];
#else
  out[0] = batch->w[i];
  out[1] = batch->w[i + 2];
  out[2] = batch->w[i + 4];
  out[3] = batch->w[i + 6];
#endif
}

#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
                                             aes_word_t shift) {
  // See
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
  aes_word_t b = (a ^ (a >> shift)) & mask;
  return a ^ b ^ (b << shift);
}

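// For example, with a = 0b0110, mask = 0b0001, and shift = 1:
// b = (a ^ (a >> 1)) & mask = 0b0001, and the result is
// a ^ b ^ (b << 1) = 0b0101, exchanging bit 0 and bit 1 while leaving the
// other bits untouched. The swap is branch-free, so it is constant-time.
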
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
  // quartets of those chunks:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  return a;
}

static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  return a;
}
#else   // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
  // Note:  0x00cc = 0b0000_0000_1100_1100
  //   0x00cc << 6 = 0b0011_0011_0000_0000
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  // Now we swap groups of four bits (still numbering by pairs):
  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  return a;
}

static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  return a;
}

static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
                                                uint8_t a2, uint8_t a3) {
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
         ((uint32_t)a3 << 24);
}
#endif  // OPENSSL_64_BIT
#endif  // !OPENSSL_SSE2

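// Composing the swaps, |aes_nohw_compact_word| performs a perfect unshuffle.
// In the 64-bit build, the sixteen 4-bit chunks 0 1 2 ... 15 end up ordered
// 0 2 4 6 8 10 12 14 | 1 3 5 7 9 11 13 15: the low nibble of every byte is
// gathered, in order, into the low half of the word and the high nibbles into
// the high half, matching the compact layout pictured in "Block
// representations" above.
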
static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                          const uint8_t in[16]) {
  memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = aes_nohw_compact_word(out[0]);
  uint64_t a1 = aes_nohw_compact_word(out[1]);
  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
  uint32_t a0 = aes_nohw_compact_word(out[0]);
  uint32_t a1 = aes_nohw_compact_word(out[1]);
  uint32_t a2 = aes_nohw_compact_word(out[2]);
  uint32_t a3 = aes_nohw_compact_word(out[3]);
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3);
  out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
  out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
  out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
#endif
}

static inline void aes_nohw_uncompact_block(
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
  memcpy(out, in, 16);  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = in[0];
  uint64_t a1 = in[1];
  uint64_t b0 =
      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
  uint64_t b1 =
      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
  memcpy(out, &b0, 8);
  memcpy(out + 8, &b1, 8);
#else
  uint32_t a0 = in[0];
  uint32_t a1 = in[1];
  uint32_t a2 = in[2];
  uint32_t a3 = in[3];
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3);
  uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
  uint32_t b2 =
      aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
  uint32_t b3 =
      aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
  b0 = aes_nohw_uncompact_word(b0);
  b1 = aes_nohw_uncompact_word(b1);
  b2 = aes_nohw_uncompact_word(b2);
  b3 = aes_nohw_uncompact_word(b3);
  memcpy(out, &b0, 4);
  memcpy(out + 4, &b1, 4);
  memcpy(out + 8, &b2, 4);
  memcpy(out + 12, &b3, 4);
#endif
}

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
                           /* uint32_t */ mask, /* const */ shift)        \
  do {                                                                    \
    __m128i swap =                                                        \
        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
                      _mm_set_epi32((mask), (mask), (mask), (mask)));     \
    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));            \
    *(b) = _mm_xor_si128(*(b), swap);                                     \
  } while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
                                      uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
  aes_word_t mask_w = mask;
#endif
  // This is a variation on a delta swap.
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
  *a ^= swap << shift;
  *b ^= swap;
}
#endif  // OPENSSL_SSE2

// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);

#if AES_NOHW_BATCH_SIZE >= 4
  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif

#if AES_NOHW_BATCH_SIZE >= 8
  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}

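// To see why a single level of swaps suffices in the 32-bit build (where
// AES_NOHW_BATCH_SIZE is 2), consider a batch of compact blocks a and b.
// Before transposing, w[0] = a0 a1 | a8 a9 | ... and w[1] = b0 b1 | b8 b9 |
// ..., so the swap with mask 0x55555555 and shift 1 exchanges the odd bits of
// w[0] with the even bits of w[1], leaving w[0] = a0 b0 | a8 b8 | ... and
// w[1] = a1 b1 | a9 b9 | ..., which is exactly the bitsliced order pictured
// in "Block representations" above.
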
// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
                              size_t num_blocks) {
  // Don't leave unused blocks uninitialized.
  memset(out, 0, sizeof(AES_NOHW_BATCH));
  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_compact_block(block, in + 16 * i);
    aes_nohw_batch_set(out, block, i);
  }

  aes_nohw_transpose(out);
}

// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to
// |out|. |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
                                const AES_NOHW_BATCH *batch) {
  AES_NOHW_BATCH copy = *batch;
  aes_nohw_transpose(&copy);

  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_batch_get(&copy, block, i);
    aes_nohw_uncompact_block(out + 16 * i, block);
  }
}


// AES round steps.

static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
                                   const AES_NOHW_BATCH *key) {
  for (size_t i = 0; i < 8; i++) {
    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
  }
}

static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
  aes_word_t x0 = batch->w[7];
  aes_word_t x1 = batch->w[6];
  aes_word_t x2 = batch->w[5];
  aes_word_t x3 = batch->w[4];
  aes_word_t x4 = batch->w[3];
  aes_word_t x5 = batch->w[2];
  aes_word_t x6 = batch->w[1];
  aes_word_t x7 = batch->w[0];

  // Figure 2, the top linear transformation.
  aes_word_t y14 = aes_nohw_xor(x3, x5);
  aes_word_t y13 = aes_nohw_xor(x0, x6);
  aes_word_t y9 = aes_nohw_xor(x0, x3);
  aes_word_t y8 = aes_nohw_xor(x0, x5);
  aes_word_t t0 = aes_nohw_xor(x1, x2);
  aes_word_t y1 = aes_nohw_xor(t0, x7);
  aes_word_t y4 = aes_nohw_xor(y1, x3);
  aes_word_t y12 = aes_nohw_xor(y13, y14);
  aes_word_t y2 = aes_nohw_xor(y1, x0);
  aes_word_t y5 = aes_nohw_xor(y1, x6);
  aes_word_t y3 = aes_nohw_xor(y5, y8);
  aes_word_t t1 = aes_nohw_xor(x4, y12);
  aes_word_t y15 = aes_nohw_xor(t1, x5);
  aes_word_t y20 = aes_nohw_xor(t1, x1);
  aes_word_t y6 = aes_nohw_xor(y15, x7);
  aes_word_t y10 = aes_nohw_xor(y15, t0);
  aes_word_t y11 = aes_nohw_xor(y20, y9);
  aes_word_t y7 = aes_nohw_xor(x7, y11);
  aes_word_t y17 = aes_nohw_xor(y10, y11);
  aes_word_t y19 = aes_nohw_xor(y10, y8);
  aes_word_t y16 = aes_nohw_xor(t0, y11);
  aes_word_t y21 = aes_nohw_xor(y13, y16);
  aes_word_t y18 = aes_nohw_xor(x0, y16);

  // Figure 3, the middle non-linear section.
  aes_word_t t2 = aes_nohw_and(y12, y15);
  aes_word_t t3 = aes_nohw_and(y3, y6);
  aes_word_t t4 = aes_nohw_xor(t3, t2);
  aes_word_t t5 = aes_nohw_and(y4, x7);
  aes_word_t t6 = aes_nohw_xor(t5, t2);
  aes_word_t t7 = aes_nohw_and(y13, y16);
  aes_word_t t8 = aes_nohw_and(y5, y1);
  aes_word_t t9 = aes_nohw_xor(t8, t7);
  aes_word_t t10 = aes_nohw_and(y2, y7);
  aes_word_t t11 = aes_nohw_xor(t10, t7);
  aes_word_t t12 = aes_nohw_and(y9, y11);
  aes_word_t t13 = aes_nohw_and(y14, y17);
  aes_word_t t14 = aes_nohw_xor(t13, t12);
  aes_word_t t15 = aes_nohw_and(y8, y10);
  aes_word_t t16 = aes_nohw_xor(t15, t12);
  aes_word_t t17 = aes_nohw_xor(t4, t14);
  aes_word_t t18 = aes_nohw_xor(t6, t16);
  aes_word_t t19 = aes_nohw_xor(t9, t14);
  aes_word_t t20 = aes_nohw_xor(t11, t16);
  aes_word_t t21 = aes_nohw_xor(t17, y20);
  aes_word_t t22 = aes_nohw_xor(t18, y19);
  aes_word_t t23 = aes_nohw_xor(t19, y21);
  aes_word_t t24 = aes_nohw_xor(t20, y18);
  aes_word_t t25 = aes_nohw_xor(t21, t22);
  aes_word_t t26 = aes_nohw_and(t21, t23);
  aes_word_t t27 = aes_nohw_xor(t24, t26);
  aes_word_t t28 = aes_nohw_and(t25, t27);
  aes_word_t t29 = aes_nohw_xor(t28, t22);
  aes_word_t t30 = aes_nohw_xor(t23, t24);
  aes_word_t t31 = aes_nohw_xor(t22, t26);
  aes_word_t t32 = aes_nohw_and(t31, t30);
  aes_word_t t33 = aes_nohw_xor(t32, t24);
  aes_word_t t34 = aes_nohw_xor(t23, t33);
  aes_word_t t35 = aes_nohw_xor(t27, t33);
  aes_word_t t36 = aes_nohw_and(t24, t35);
  aes_word_t t37 = aes_nohw_xor(t36, t34);
  aes_word_t t38 = aes_nohw_xor(t27, t36);
  aes_word_t t39 = aes_nohw_and(t29, t38);
  aes_word_t t40 = aes_nohw_xor(t25, t39);
  aes_word_t t41 = aes_nohw_xor(t40, t37);
  aes_word_t t42 = aes_nohw_xor(t29, t33);
  aes_word_t t43 = aes_nohw_xor(t29, t40);
  aes_word_t t44 = aes_nohw_xor(t33, t37);
  aes_word_t t45 = aes_nohw_xor(t42, t41);
  aes_word_t z0 = aes_nohw_and(t44, y15);
  aes_word_t z1 = aes_nohw_and(t37, y6);
  aes_word_t z2 = aes_nohw_and(t33, x7);
  aes_word_t z3 = aes_nohw_and(t43, y16);
  aes_word_t z4 = aes_nohw_and(t40, y1);
  aes_word_t z5 = aes_nohw_and(t29, y7);
  aes_word_t z6 = aes_nohw_and(t42, y11);
  aes_word_t z7 = aes_nohw_and(t45, y17);
  aes_word_t z8 = aes_nohw_and(t41, y10);
  aes_word_t z9 = aes_nohw_and(t44, y12);
  aes_word_t z10 = aes_nohw_and(t37, y3);
  aes_word_t z11 = aes_nohw_and(t33, y4);
  aes_word_t z12 = aes_nohw_and(t43, y13);
  aes_word_t z13 = aes_nohw_and(t40, y5);
  aes_word_t z14 = aes_nohw_and(t29, y2);
  aes_word_t z15 = aes_nohw_and(t42, y9);
  aes_word_t z16 = aes_nohw_and(t45, y14);
  aes_word_t z17 = aes_nohw_and(t41, y8);

  // Figure 4, bottom linear transformation.
  aes_word_t t46 = aes_nohw_xor(z15, z16);
  aes_word_t t47 = aes_nohw_xor(z10, z11);
  aes_word_t t48 = aes_nohw_xor(z5, z13);
  aes_word_t t49 = aes_nohw_xor(z9, z10);
  aes_word_t t50 = aes_nohw_xor(z2, z12);
  aes_word_t t51 = aes_nohw_xor(z2, z5);
  aes_word_t t52 = aes_nohw_xor(z7, z8);
  aes_word_t t53 = aes_nohw_xor(z0, z3);
  aes_word_t t54 = aes_nohw_xor(z6, z7);
  aes_word_t t55 = aes_nohw_xor(z16, z17);
  aes_word_t t56 = aes_nohw_xor(z12, t48);
  aes_word_t t57 = aes_nohw_xor(t50, t53);
  aes_word_t t58 = aes_nohw_xor(z4, t46);
  aes_word_t t59 = aes_nohw_xor(z3, t54);
  aes_word_t t60 = aes_nohw_xor(t46, t57);
  aes_word_t t61 = aes_nohw_xor(z14, t57);
  aes_word_t t62 = aes_nohw_xor(t52, t58);
  aes_word_t t63 = aes_nohw_xor(t49, t58);
  aes_word_t t64 = aes_nohw_xor(z4, t59);
  aes_word_t t65 = aes_nohw_xor(t61, t62);
  aes_word_t t66 = aes_nohw_xor(z1, t63);
  aes_word_t s0 = aes_nohw_xor(t59, t63);
  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
  aes_word_t t67 = aes_nohw_xor(t64, t65);
  aes_word_t s3 = aes_nohw_xor(t53, t66);
  aes_word_t s4 = aes_nohw_xor(t51, t66);
  aes_word_t s5 = aes_nohw_xor(t47, t65);
  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));

  batch->w[0] = s7;
  batch->w[1] = s6;
  batch->w[2] = s5;
  batch->w[3] = s4;
  batch->w[4] = s3;
  batch->w[5] = s2;
  batch->w[6] = s1;
  batch->w[7] = s0;
}

// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES
// S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) {
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the circulant
  // [1 0 0 0 1 1 1 1].
  aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7));
  aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0));
  aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1));
  aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2));
  aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3));
  aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4));
  aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5));
  aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6));

  // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant,
  // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.)
  batch->w[0] = aes_nohw_not(b0);
  batch->w[1] = b1;
  batch->w[2] = aes_nohw_not(b2);
  batch->w[3] = b3;
  batch->w[4] = b4;
  batch->w[5] = b5;
  batch->w[6] = b6;
  batch->w[7] = b7;
}

static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) {
  // We implement the inverse S-box using the forwards implementation with the
  // technique described in https://www.bearssl.org/constanttime.html#aes.
  //
  // The forwards S-box inverts its input and applies an affine transformation:
  // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then:
  //
  //   InvS(x) = Inv(InvA(x))
  //           = InvA(S(InvA(x)))
  aes_nohw_sub_bytes_inv_affine(batch);
  aes_nohw_sub_bytes(batch);
  aes_nohw_sub_bytes_inv_affine(batch);
}

// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
  (aes_nohw_or(aes_nohw_shift_right((v), (n)*4),                      \
               aes_nohw_shift_left((v), 16 - (n)*4)))

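// For example, |aes_nohw_rotate_cols_right(v, 1)| moves logical bytes 4..15
// of |v| down four positions and wraps logical bytes 0..3 around to the top,
// so column c of the result is column c + 1 (mod 4) of |v|. Applied to the
// row-1 bytes below, that is exactly ShiftRows' one-byte left rotation of
// row 1.
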
static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 1);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 3);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 3);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 1);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
         ((v << 12) & UINT64_C(0xf000f000f000f000));
#else
  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}

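// Because a column occupies four consecutive logical bytes, each branch above
// is a rotation within every 4 * AES_NOHW_BATCH_SIZE-bit group. In the 32-bit
// build, for instance, one 8-bit group 0b11100100 (rows 3..0 reading left to
// right, two bits per row) becomes 0b00111001: row r of the result holds row
// r + 1 of the input.
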
// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}

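// One way to read the expressions in |aes_nohw_mix_columns| below: writing
// r() for |aes_nohw_rotate_rows_down| and xtime() for multiplication by x in
// GF(2^8), MixColumns can be decomposed as
//
//   MixColumns(a) = xtime(a ^ r(a)) ^ r(a) ^ r^2(a ^ r(a)).
//
// In bitsliced form, slice i of xtime(v) is slice i-1 of v, with slice 7
// additionally folded into slices 0, 1, 3, and 4 (the bits of the reduction
// polynomial x^8 + x^4 + x^3 + x + 1). Substituting v = a ^ r(a), slice 0 of
// the result is v_7 ^ r(a)_0 ^ r^2(v)_0, which is the |batch->w[0]|
// expression below; the other slices follow the same pattern.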
static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);

  batch->w[0] =
      aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
  batch->w[1] =
      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
  batch->w[2] =
      aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
  batch->w[3] =
      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
  batch->w[5] =
      aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
  batch->w[6] =
      aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
  batch->w[7] =
      aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}

static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) {
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  // bsaes-x86_64.pl describes the following decomposition of the inverse
  // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler
  // multiplication.
  //
  // | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
  // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  // | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
  // | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
  //
  // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described
  // by the following bit equations:
  //
  //   b0 = a6
  //   b1 = a6 ^ a7
  //   b2 = a0 ^ a7
  //   b3 = a1 ^ a6
  //   b4 = a2 ^ a6 ^ a7
  //   b5 = a3 ^ a7
  //   b6 = a4
  //   b7 = a5
  //
  // Each coefficient is given by:
  //
  //   b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij
  //
  // We combine the two equations below. Note a_i(j+2) is a row rotation.
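  //
  // As a sanity check on the decomposition, the top-left entry works out:
  // row [02 03 01 01] times column [05 00 04 00] is
  // 02·05 ⊕ 01·04 = 0x0a ⊕ 0x04 = 0x0e, matching the 0e in the inverse
  // matrix. The multiply-by-4 equations are two applications of xtime():
  // e.g. 04·01 = 0x04 sets only b2 = a0, and 04·40 = 0x1b sets
  // b0, b1, b3, b4 = a6, the bits of the reduction polynomial.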
  aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0));
  aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1));
  aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2));
  aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3));
  aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4));
  aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5));
  aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6));
  aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7));

  batch->w[0] = aes_nohw_xor(a0, a6_r6);
  batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7));
  batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7));
  batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7));
  batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7));
  batch->w[6] = aes_nohw_xor(a6, a4_r4);
  batch->w[7] = aes_nohw_xor(a7, a5_r5);

  // Apply the [02 03 01 01] matrix, which is just MixColumns.
  aes_nohw_mix_columns(batch);
}

static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[0]);
  for (size_t i = 1; i < num_rounds; i++) {
    aes_nohw_sub_bytes(batch);
    aes_nohw_shift_rows(batch);
    aes_nohw_mix_columns(batch);
    aes_nohw_add_round_key(batch, &key->keys[i]);
  }
  aes_nohw_sub_bytes(batch);
  aes_nohw_shift_rows(batch);
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}

static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
  aes_nohw_inv_shift_rows(batch);
  aes_nohw_inv_sub_bytes(batch);
  for (size_t i = num_rounds - 1; i > 0; i--) {
    aes_nohw_add_round_key(batch, &key->keys[i]);
    aes_nohw_inv_mix_columns(batch);
    aes_nohw_inv_shift_rows(batch);
    aes_nohw_inv_sub_bytes(batch);
  }
  aes_nohw_add_round_key(batch, &key->keys[0]);
}


// Key schedule.

static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                       const AES_KEY *key) {
  for (size_t i = 0; i <= key->rounds; i++) {
    // Copy the round key into each block in the batch.
    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
      memcpy(tmp, key->rd_key + 4 * i, 16);
      aes_nohw_batch_set(&out->keys[i], tmp, j);
    }
    aes_nohw_transpose(&out->keys[i]);
  }
}

static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                          0x20, 0x40, 0x80, 0x1b, 0x36};

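// Each rcon value is the previous one times x in GF(2^8): after 0x80, x^8
// reduces to x^4 + x^3 + x + 1 = 0x1b, and 0x36 = xtime(0x1b).
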
// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
  return _mm_set_epi32(0, 0, 0, rcon);
#else
  return ((aes_word_t)rcon);
#endif
}

// aes_nohw_sub_block applies the AES S-box to each byte of the single compact
// block |in| and writes the result to |out|, by placing the block in an
// otherwise empty batch and reusing |aes_nohw_sub_bytes|.
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
  AES_NOHW_BATCH batch;
  memset(&batch, 0, sizeof(batch));
  aes_nohw_batch_set(&batch, in, 0);
  aes_nohw_transpose(&batch);
  aes_nohw_sub_bytes(&batch);
  aes_nohw_transpose(&batch);
  aes_nohw_batch_get(&batch, out, 0);
}

static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
  key->rounds = 10;

  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  memcpy(key->rd_key, block, 16);

  for (size_t i = 1; i <= 10; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block);
    uint8_t rcon = aes_nohw_rcon[i - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
      block[j] = aes_nohw_xor(
          block[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words. Note this is reordered from the usual
      // formulation to avoid needing masks.
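      //
      // The three shifted XORs below compute a prefix-XOR over the block's
      // four words: word k of the result is v_0 ^ v_1 ^ ... ^ v_k. Since
      // word 0 of |v| already contains w0 ^ rcon ^ SubWord(RotWord(w3)), this
      // reproduces the usual AES-128 recurrence, where each round-key word is
      // the previous word XORed with the word four positions back.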
      aes_word_t v = block[j];
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * i, block, 16);
  }
}

static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) {
  key->rounds = 12;

  aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS];
  aes_word_t *block1 = storage1, *block2 = storage2;

  // AES-192's key schedule is complex because each key schedule iteration
  // produces six words, but we compute on blocks and each block is four words.
  // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time.
  // We loop below every three blocks or two key schedule iterations.
  //
  // On entry to the loop, |block1| and the first half of |block2| contain the
  // previous key schedule iteration. |block1| has been written to |key|, but
  // |block2| has not as it is incomplete.
  aes_nohw_compact_block(block1, in);
  memcpy(key->rd_key, block1, 16);

  uint8_t half_block[16] = {0};
  memcpy(half_block, in + 16, 8);
  aes_nohw_compact_block(block2, half_block);

  for (size_t i = 0; i < 4; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[2 * i];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Compute the first two words of the next key schedule iteration, which
      // go in the second half of |block2|. The first two words of the previous
      // iteration are in the first half of |block1|. Apply |rcon| here too
      // because the shifts match.
      block2[j] = aes_nohw_or(
          block2[j],
          aes_nohw_shift_left(
              aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8));
      // Incorporate the transformed word and propagate. Note the last word of
      // the previous iteration corresponds to the second word of |copy|. This
      // is incorporated into the first word of the next iteration, or the third
      // word of |block2|.
      block2[j] = aes_nohw_xor(
          block2[j], aes_nohw_and(aes_nohw_shift_left(
                                      aes_nohw_rotate_rows_down(sub[j]), 4),
                                  AES_NOHW_COL2_MASK));
      block2[j] = aes_nohw_xor(
          block2[j],
          aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK));

      // Compute the remaining four words, which fill |block1|. Begin by moving
      // the corresponding words of the previous iteration: the second half of
      // |block1| and the first half of |block2|.
      block1[j] = aes_nohw_shift_right(block1[j], 8);
      block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8));
      // Incorporate the second word, computed previously in |block2|, and
      // propagate.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }

    // This completes two round keys. Note half of |block2| was computed in the
    // previous loop iteration but was not yet output.
    memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16);
    memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16);

    aes_nohw_sub_block(sub, block1);
    rcon = aes_nohw_rcon[2 * i + 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Compute the first four words of the next key schedule iteration in
      // |block2|. Begin by moving the corresponding words of the previous
      // iteration: the second half of |block2| and the first half of |block1|.
      block2[j] = aes_nohw_shift_right(block2[j], 8);
      block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8));
      // Incorporate rcon and the transformed word. Note the last word of the
      // previous iteration corresponds to the last word of |copy|.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j));
      block2[j] = aes_nohw_xor(
          block2[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));

      // Compute the last two words, which go in the first half of |block1|. The
      // last two words of the previous iteration are in the second half of
      // |block1|.
      block1[j] = aes_nohw_shift_right(block1[j], 8);
      // Propagate blocks and mask off the excess.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4));
      block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK);
    }

    // |block2| has a complete round key. |block1| will be completed in the next
    // iteration.
    memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16);

    // Swap blocks to restore the invariant.
    aes_word_t *tmp = block1;
    block1 = block2;
    block2 = tmp;
  }
}

static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
  key->rounds = 14;

  // Each key schedule iteration produces two round keys.
  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block1, in);
  memcpy(key->rd_key, block1, 16);

  aes_nohw_compact_block(block2, in + 16);
  memcpy(key->rd_key + 4, block2, 16);

  for (size_t i = 2; i <= 14; i += 2) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
      block1[j] = aes_nohw_xor(
          block1[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * i, block1, 16);

    if (i == 14) {
      break;
    }

    aes_nohw_sub_block(sub, block1);
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate the transformed word into the first word.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * (i + 1), block2, 16);
  }
}


// External API.

int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) {
  switch (bits) {
    case 128:
      aes_nohw_setup_key_128(aeskey, key);
      return 0;
    case 192:
      aes_nohw_setup_key_192(aeskey, key);
      return 0;
    case 256:
      aes_nohw_setup_key_256(aeskey, key);
      return 0;
  }
  return 1;
}

int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) {
  return aes_nohw_set_encrypt_key(key, bits, aeskey);
}

void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}

void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}

static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
                                      const uint8_t b[16]) {
  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
    aes_word_t x, y;
    memcpy(&x, a + i, sizeof(aes_word_t));
    memcpy(&y, b + i, sizeof(aes_word_t));
    x = aes_nohw_xor(x, y);
    memcpy(out + i, &x, sizeof(aes_word_t));
  }
}

void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                   size_t blocks, const AES_KEY *key,
                                   const uint8_t ivec[16]) {
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);

  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
  alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16];
  alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16];
  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
    memcpy(ivs + 16 * i, ivec, 16);
  }

  uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
  for (;;) {
    // Update counters.
    for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
      CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
    }

    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, ivs, todo);
    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(enc_ivs, todo, &batch);

    for (size_t i = 0; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i);
    }

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
    ctr += AES_NOHW_BATCH_SIZE;
  }
}

void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t *ivec, const int enc) {
  assert(len % 16 == 0);
  size_t blocks = len / 16;
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16];
  memcpy(iv, ivec, 16);

  if (enc) {
    // CBC encryption is not parallelizable.
    while (blocks > 0) {
      aes_nohw_xor_block(iv, iv, in);

      AES_NOHW_BATCH batch;
      aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1);
      aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
      aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);

      memcpy(iv, out, 16);

      in += 16;
      out += 16;
      blocks--;
    }
    memcpy(ivec, iv, 16);
    return;
  }

  for (;;) {
    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    // Make a copy of the input so we can decrypt in-place.
    alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16];
    memcpy(copy, in, todo * 16);

    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, in, todo);
    aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(out, todo, &batch);

    aes_nohw_xor_block(out, out, iv);
    for (size_t i = 1; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1));
    }

    // Save the last block as the IV.
    memcpy(iv, copy + 16 * (todo - 1), 16);

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
  }

  memcpy(ivec, iv, 16);
}