Coverage Report

Created: 2025-08-25 07:28

/src/liboqs/src/common/aes/aes256_ni.c

The original report listed three columns per source line: Line, Count, and Source. Every executable line of this file reported a count of 0, i.e. none of it was exercised. The source is reproduced below without the per-line counters.
// SPDX-License-Identifier: Public domain
// Based on public domain code by Romain Dolbeau
// http://dolbeau.name/dolbeau/crypto/crypto.html


#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <oqs/common.h>

#include <wmmintrin.h>
#include <tmmintrin.h>

#define AES_BLOCKBYTES 16

typedef struct {
  __m128i sk_exp[15];
  __m128i iv;
} aes256ctx;

#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
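
The BE_TO_UINT32 macro assembles a 32-bit value from four consecutive bytes in big-endian order, regardless of the host's endianness. A minimal stand-alone sketch of its behaviour (the buffer contents and printed value are illustrative only, not taken from the file):

    #include <stdint.h>
    #include <stdio.h>

    #define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))

    int main(void) {
      uint8_t buf[4] = {0x12, 0x34, 0x56, 0x78};
      /* Reads buf[0..3] most-significant byte first: prints 0x12345678 on
         both little- and big-endian hosts. */
      printf("0x%08x\n", (unsigned) BE_TO_UINT32(buf[0]));
      return 0;
    }
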
// From crypto_core/aes256encrypt/dolbeau/aesenc-int
static inline void aes256ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[15]) {
  __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
  __m128i key1 = _mm_loadu_si128((const __m128i *)(key + 16));
  __m128i temp0, temp1, temp2, temp4;
  int idx = 0;

  rkeys[idx++] = key0;
  temp0 = key0;
  temp2 = key1;

  /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
#define BLOCK1(IMM)                                \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
    rkeys[idx++] = temp2;                          \
    temp4 = _mm_slli_si128(temp0,4);               \
    temp0 = _mm_xor_si128(temp0,temp4);            \
    temp4 = _mm_slli_si128(temp0,8);               \
    temp0 = _mm_xor_si128(temp0,temp4);            \
    temp1 = _mm_shuffle_epi32(temp1,0xff);         \
    temp0 = _mm_xor_si128(temp0,temp1)

#define BLOCK2(IMM)                                \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
    rkeys[idx++] = temp0;                          \
    temp4 = _mm_slli_si128(temp2,4);               \
    temp2 = _mm_xor_si128(temp2,temp4);            \
    temp4 = _mm_slli_si128(temp2,8);               \
    temp2 = _mm_xor_si128(temp2,temp4);            \
    temp1 = _mm_shuffle_epi32(temp1,0xaa);         \
    temp2 = _mm_xor_si128(temp2,temp1)

  BLOCK1(0x01);
  BLOCK2(0x01);

  BLOCK1(0x02);
  BLOCK2(0x02);

  BLOCK1(0x04);
  BLOCK2(0x04);

  BLOCK1(0x08);
  BLOCK2(0x08);

  BLOCK1(0x10);
  BLOCK2(0x10);

  BLOCK1(0x20);
  BLOCK2(0x20);

  BLOCK1(0x40);
  rkeys[idx++] = temp0;
}

void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule) {
  *_schedule = OQS_MEM_malloc(sizeof(aes256ctx));
  OQS_EXIT_IF_NULLPTR(*_schedule, "AES");
  assert(*_schedule != NULL);
  __m128i *schedule = ((aes256ctx *) *_schedule)->sk_exp;
  aes256ni_setkey_encrypt(key, schedule);
}

void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) {
  aes256ctx *ctx = _schedule;
  __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
  if (iv_len == 12) {
    const int32_t *ivi = (const int32_t *) iv;
    ctx->iv = _mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx);
  } else if (iv_len == 16) {
    ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx);
  } else {
    exit(EXIT_FAILURE);
  }
}

void oqs_aes256_load_iv_u64_ni(uint64_t iv, void *_schedule) {
  aes256ctx *ctx = _schedule;
  ctx->iv = _mm_loadl_epi64((__m128i *)&iv);
}
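
The shuffle mask _mm_set_epi8(8, 9, ..., 1, 0) used when loading the IV leaves bytes 0-7 of the counter block unchanged and reverses bytes 8-15. That places the big-endian counter held in the last eight bytes of the block into a native 64-bit lane that the CTR routines below can advance with _mm_add_epi64; applying the same shuffle again restores the original byte order. A small stand-alone sketch of this trick (the counter value is made up; compile with SSSE3 enabled, e.g. -mssse3):

    #include <stdint.h>
    #include <stdio.h>
    #include <emmintrin.h>   /* _mm_add_epi64, _mm_loadu_si128, _mm_storeu_si128 */
    #include <tmmintrin.h>   /* _mm_shuffle_epi8 */

    int main(void) {
      /* Counter block whose big-endian counter (last 8 bytes) ends in ...fcfdfeff. */
      uint8_t block[16] = {
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
      };
      const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

      __m128i iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) block), mask);
      iv = _mm_add_epi64(iv, _mm_set_epi64x(1, 0));  /* bump the counter lane */
      __m128i next = _mm_shuffle_epi8(iv, mask);     /* back to wire byte order */

      uint8_t out[16];
      _mm_storeu_si128((__m128i *) out, next);
      for (int i = 0; i < 16; i++) {
        printf("%02x", out[i]);                      /* ends ...fcfdff00: big-endian +1 */
      }
      printf("\n");
      return 0;
    }
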

void oqs_aes256_free_schedule_ni(void *schedule) {
  if (schedule != NULL) {
    OQS_MEM_secure_free(schedule, sizeof(aes256ctx));
  }
}


// Modified from crypto_core/aes256encrypt/dolbeau/aesenc-int
static inline void aes256ni_encrypt(const __m128i rkeys[15], __m128i nv, unsigned char *out) {
  __m128i temp = _mm_xor_si128(nv, rkeys[0]);
  temp = _mm_aesenc_si128(temp, rkeys[1]);
  temp = _mm_aesenc_si128(temp, rkeys[2]);
  temp = _mm_aesenc_si128(temp, rkeys[3]);
  temp = _mm_aesenc_si128(temp, rkeys[4]);
  temp = _mm_aesenc_si128(temp, rkeys[5]);
  temp = _mm_aesenc_si128(temp, rkeys[6]);
  temp = _mm_aesenc_si128(temp, rkeys[7]);
  temp = _mm_aesenc_si128(temp, rkeys[8]);
  temp = _mm_aesenc_si128(temp, rkeys[9]);
  temp = _mm_aesenc_si128(temp, rkeys[10]);
  temp = _mm_aesenc_si128(temp, rkeys[11]);
  temp = _mm_aesenc_si128(temp, rkeys[12]);
  temp = _mm_aesenc_si128(temp, rkeys[13]);
  temp = _mm_aesenclast_si128(temp, rkeys[14]);
  _mm_storeu_si128((__m128i *)(out), temp);
}

// 4x interleaved encryption
static inline void aes256ni_encrypt_x4(const __m128i rkeys[15], __m128i n0, __m128i n1, __m128i n2, __m128i n3, unsigned char *out) {
  __m128i temp0 = _mm_xor_si128(n0, rkeys[0]);
  __m128i temp1 = _mm_xor_si128(n1, rkeys[0]);
  __m128i temp2 = _mm_xor_si128(n2, rkeys[0]);
  __m128i temp3 = _mm_xor_si128(n3, rkeys[0]);

#define AESNENCX4(IDX) \
    temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \
    temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \
    temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \
    temp3 = _mm_aesenc_si128(temp3, rkeys[IDX])

  AESNENCX4(1);
  AESNENCX4(2);
  AESNENCX4(3);
  AESNENCX4(4);
  AESNENCX4(5);
  AESNENCX4(6);
  AESNENCX4(7);
  AESNENCX4(8);
  AESNENCX4(9);
  AESNENCX4(10);
  AESNENCX4(11);
  AESNENCX4(12);
  AESNENCX4(13);

  temp0 = _mm_aesenclast_si128(temp0, rkeys[14]);
  temp1 = _mm_aesenclast_si128(temp1, rkeys[14]);
  temp2 = _mm_aesenclast_si128(temp2, rkeys[14]);
  temp3 = _mm_aesenclast_si128(temp3, rkeys[14]);

  _mm_storeu_si128((__m128i *)(out + 0), temp0);
  _mm_storeu_si128((__m128i *)(out + 16), temp1);
  _mm_storeu_si128((__m128i *)(out + 32), temp2);
  _mm_storeu_si128((__m128i *)(out + 48), temp3);
}

void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
  const __m128i *schedule = ((const aes256ctx *) _schedule)->sk_exp;
  aes256ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext);
}

void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
  assert(plaintext_len % 16 == 0);
  for (size_t block = 0; block < plaintext_len / 16; block++) {
    oqs_aes256_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
  }
}
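
As a sanity check of the block path above, the public helpers can be exercised against the AES-256 known-answer vector from FIPS-197 Appendix C.3 (key 000102...1f, plaintext 00112233445566778899aabbccddeeff, ciphertext 8ea2b7ca516745bfeafc49904b496089). A hypothetical stand-alone harness, assuming the *_ni symbols can be linked directly (inside liboqs they are normally reached through its AES dispatch code) and that the CPU supports AES-NI:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Prototypes copied from the file above. */
    void oqs_aes256_load_schedule_ni(const uint8_t *key, void **schedule);
    void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *schedule, uint8_t *ciphertext);
    void oqs_aes256_free_schedule_ni(void *schedule);

    int main(void) {
      const uint8_t key[32] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
      };
      const uint8_t pt[16] = {
        0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
        0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
      };
      const uint8_t expected[16] = {
        0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf,
        0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89
      };
      uint8_t ct[16];
      void *sched = NULL;

      oqs_aes256_load_schedule_ni(key, &sched);
      oqs_aes256_enc_sch_block_ni(pt, sched, ct);
      oqs_aes256_free_schedule_ni(sched);

      printf("%s\n", memcmp(ct, expected, 16) == 0 ? "KAT ok" : "KAT FAILED");
      return 0;
    }
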

void oqs_aes256_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) {
  aes256ctx *ctx = (aes256ctx *) schedule;
  const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

  while (out_blks >= 4) {
    __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
    __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask);
    __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask);
    __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask);
    aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
    ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0));
    out += 64;
    out_blks -= 4;
  }
  while (out_blks >= 1) {
    __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
    aes256ni_encrypt(schedule, nv0, out);
    ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0));
    out += 16;
    out_blks--;
  }
}

void oqs_aes256_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
  __m128i block;
  __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
  if (iv_len == 12) {
    const int32_t *ivi = (const int32_t *) iv;
    block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]);
  } else if (iv_len == 16) {
    block = _mm_loadu_si128((const __m128i *)iv);
  } else {
    exit(EXIT_FAILURE);
  }

  while (out_len >= 64) {
    __m128i nv0 = block;
    __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
    __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask);
    __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask);
    aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
    block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask);
    out += 64;
    out_len -= 64;
  }
  while (out_len >= 16) {
    aes256ni_encrypt(schedule, block, out);
    out += 16;
    out_len -= 16;
    block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
  }
  if (out_len > 0) {
    uint8_t tmp[16];
    aes256ni_encrypt(schedule, block, tmp);
    memcpy(out, tmp, out_len);
  }
}
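
The two CTR entry points are meant to agree: oqs_aes256_ctr_enc_sch_ni derives its counter blocks the same way as the oqs_aes256_load_iv_ni plus oqs_aes256_ctr_enc_sch_upd_blks_ni pair, so for the same key and IV they should emit identical keystream. A hypothetical cross-check sketch under the same assumptions as the harness above (direct linkage, AES-NI available); the all-zero key and nonce are arbitrary demo values:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Prototypes copied from the file above. */
    void oqs_aes256_load_schedule_ni(const uint8_t *key, void **schedule);
    void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *schedule);
    void oqs_aes256_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks);
    void oqs_aes256_ctr_enc_sch_ni(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
    void oqs_aes256_free_schedule_ni(void *schedule);

    int main(void) {
      const uint8_t key[32] = {0};   /* demo key */
      const uint8_t iv[12] = {0};    /* 12-byte IV; the remaining counter bytes start at zero */
      uint8_t one_shot[64], incremental[64];
      void *sched = NULL;

      oqs_aes256_load_schedule_ni(key, &sched);

      /* One-shot keystream: four counter blocks in a single call. */
      oqs_aes256_ctr_enc_sch_ni(iv, sizeof iv, sched, one_shot, sizeof one_shot);

      /* Stateful path: load the IV into the context, then request four blocks. */
      oqs_aes256_load_iv_ni(iv, sizeof iv, sched);
      oqs_aes256_ctr_enc_sch_upd_blks_ni(sched, incremental, 4);

      printf("%s\n", memcmp(one_shot, incremental, sizeof one_shot) == 0 ? "match" : "MISMATCH");

      oqs_aes256_free_schedule_ni(sched);
      return 0;
    }
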