/src/liboqs/src/common/aes/aes256_ni.c
// SPDX-License-Identifier: Public domain
// Based on public domain code by Romain Dolbeau
// http://dolbeau.name/dolbeau/crypto/crypto.html

#include <assert.h>
#include <stdint.h>
#include <stdlib.h> /* exit(), EXIT_FAILURE */
#include <string.h>
#include <oqs/common.h>

#include <wmmintrin.h> /* AES-NI intrinsics */
#include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 */

#define AES_BLOCKBYTES 16

typedef struct {
    __m128i sk_exp[15]; /* expanded round keys: whitening key + 14 rounds */
    __m128i iv;         /* running CTR counter block (see oqs_aes256_load_iv_ni) */
} aes256ctx;

/* Read a 32-bit big-endian value from the byte buffer n. */
#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))

// From crypto_core/aes256encrypt/dolbeau/aesenc-int
static inline void aes256ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[15]) {
    __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    __m128i key1 = _mm_loadu_si128((const __m128i *)(key + 16));
    __m128i temp0, temp1, temp2, temp4;
    int idx = 0;

    rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;

    /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
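    /*
     * BLOCK1 stores the pending odd-half round key and derives the next
     * even-half key; BLOCK2 does the reverse. In each, the two
     * _mm_slli_si128/_mm_xor_si128 pairs build the running XOR of the four
     * 32-bit words of the previous key, while _mm_aeskeygenassist_si128
     * supplies the SubWord/RotWord/Rcon word (broadcast from lane 0xff) for
     * BLOCK1 and the plain SubWord word (lane 0xaa) for BLOCK2, as the
     * AES-256 key schedule requires.
     */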
#define BLOCK1(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
    rkeys[idx++] = temp2; \
    temp4 = _mm_slli_si128(temp0, 4); \
    temp0 = _mm_xor_si128(temp0, temp4); \
    temp4 = _mm_slli_si128(temp0, 8); \
    temp0 = _mm_xor_si128(temp0, temp4); \
    temp1 = _mm_shuffle_epi32(temp1, 0xff); \
    temp0 = _mm_xor_si128(temp0, temp1)

#define BLOCK2(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
    rkeys[idx++] = temp0; \
    temp4 = _mm_slli_si128(temp2, 4); \
    temp2 = _mm_xor_si128(temp2, temp4); \
    temp4 = _mm_slli_si128(temp2, 8); \
    temp2 = _mm_xor_si128(temp2, temp4); \
    temp1 = _mm_shuffle_epi32(temp1, 0xaa); \
    temp2 = _mm_xor_si128(temp2, temp1)

    BLOCK1(0x01);
    BLOCK2(0x01);

    BLOCK1(0x02);
    BLOCK2(0x02);

    BLOCK1(0x04);
    BLOCK2(0x04);

    BLOCK1(0x08);
    BLOCK2(0x08);

    BLOCK1(0x10);
    BLOCK2(0x10);

    BLOCK1(0x20);
    BLOCK2(0x20);

    BLOCK1(0x40);
    rkeys[idx++] = temp0;
}

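/*
 * Allocates an aes256ctx and fills its round-key table. OQS_EXIT_IF_NULLPTR
 * aborts on allocation failure, so the assert below is only a debug-build
 * backstop.
 */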
void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule) {
    *_schedule = OQS_MEM_malloc(sizeof(aes256ctx));
    OQS_EXIT_IF_NULLPTR(*_schedule, "AES");
    assert(*_schedule != NULL);
    __m128i *schedule = ((aes256ctx *) *_schedule)->sk_exp;
    aes256ni_setkey_encrypt(key, schedule);
}

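/*
 * Stores the CTR counter block in a pre-swapped form: the shuffle mask keeps
 * bytes 0-7 in place and byte-reverses bytes 8-15, so the big-endian counter
 * at the end of the IV becomes a little-endian uint64 in the high lane that
 * can be advanced with _mm_add_epi64. A 12-byte IV is zero-padded to 16
 * bytes (counter starts at 0); any other length is rejected.
 */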
void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) {
    aes256ctx *ctx = _schedule;
    __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
    if (iv_len == 12) {
        const int32_t *ivi = (const int32_t *) iv;
        ctx->iv = _mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx);
    } else if (iv_len == 16) {
        ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx);
    } else {
        exit(EXIT_FAILURE);
    }
}

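/*
 * The 64-bit IV lands in bytes 0-7 (_mm_loadl_epi64 zeroes the upper lane),
 * so the counter half of the block starts at zero.
 */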
void oqs_aes256_load_iv_u64_ni(uint64_t iv, void *_schedule) {
    aes256ctx *ctx = _schedule;
    ctx->iv = _mm_loadl_epi64((__m128i *)&iv);
}

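/* Zeroizes the expanded round keys before releasing the context. */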
void oqs_aes256_free_schedule_ni(void *schedule) {
    if (schedule != NULL) {
        OQS_MEM_secure_free(schedule, sizeof(aes256ctx));
    }
}

// Modified from crypto_core/aes256encrypt/dolbeau/aesenc-int
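// One block: whitening XOR with rkeys[0], 13 AESENC rounds, then AESENCLAST
// for the final (14th) round, which skips MixColumns as AES requires.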
static inline void aes256ni_encrypt(const __m128i rkeys[15], __m128i nv, unsigned char *out) {
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);
    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *)(out), temp);
}

// 4x interleaved encryption
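// Keeping four independent blocks in flight lets the pipelined AESENC unit
// overlap rounds across streams instead of stalling on each round's latency.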
static inline void aes256ni_encrypt_x4(const __m128i rkeys[15], __m128i n0, __m128i n1, __m128i n2, __m128i n3, unsigned char *out) {
    __m128i temp0 = _mm_xor_si128(n0, rkeys[0]);
    __m128i temp1 = _mm_xor_si128(n1, rkeys[0]);
    __m128i temp2 = _mm_xor_si128(n2, rkeys[0]);
    __m128i temp3 = _mm_xor_si128(n3, rkeys[0]);

#define AESNENCX4(IDX) \
    temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \
    temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \
    temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \
    temp3 = _mm_aesenc_si128(temp3, rkeys[IDX])

    AESNENCX4(1);
    AESNENCX4(2);
    AESNENCX4(3);
    AESNENCX4(4);
    AESNENCX4(5);
    AESNENCX4(6);
    AESNENCX4(7);
    AESNENCX4(8);
    AESNENCX4(9);
    AESNENCX4(10);
    AESNENCX4(11);
    AESNENCX4(12);
    AESNENCX4(13);

    temp0 = _mm_aesenclast_si128(temp0, rkeys[14]);
    temp1 = _mm_aesenclast_si128(temp1, rkeys[14]);
    temp2 = _mm_aesenclast_si128(temp2, rkeys[14]);
    temp3 = _mm_aesenclast_si128(temp3, rkeys[14]);

    _mm_storeu_si128((__m128i *)(out + 0), temp0);
    _mm_storeu_si128((__m128i *)(out + 16), temp1);
    _mm_storeu_si128((__m128i *)(out + 32), temp2);
    _mm_storeu_si128((__m128i *)(out + 48), temp3);
}

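/* Encrypts a single 16-byte block with an already-expanded schedule. */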
void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
    const __m128i *schedule = ((const aes256ctx *) _schedule)->sk_exp;
    aes256ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext);
}

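/* ECB mode: each 16-byte block is encrypted independently; plaintext_len
 * must be a multiple of the block size. */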
void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    for (size_t block = 0; block < plaintext_len / 16; block++) {
        oqs_aes256_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
    }
}

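/*
 * Stateful CTR keystream generation: ctx->iv holds the pre-swapped counter
 * block (see oqs_aes256_load_iv_ni), so each block needs one shuffle back to
 * wire order before encryption, and the counter advances with plain 64-bit
 * adds in the high lane. Passing `schedule` directly as the round-key array
 * works because sk_exp is the first member of aes256ctx.
 */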
void oqs_aes256_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) {
    aes256ctx *ctx = (aes256ctx *) schedule;
    const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

    while (out_blks >= 4) {
        __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
        __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask);
        __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask);
        __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask);
        aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
        ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0));
        out += 64;
        out_blks -= 4;
    }
    while (out_blks >= 1) {
        __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
        aes256ni_encrypt(schedule, nv0, out);
        ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0));
        out += 16;
        out_blks--;
    }
}

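/*
 * Stateless CTR from an explicit IV: here the counter block stays in wire
 * byte order, so every 64-bit increment is bracketed by a byte-swap shuffle
 * and the block can be fed to the cipher unmodified. A trailing partial
 * block is produced into a stack buffer and copied out.
 */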
void oqs_aes256_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
    __m128i block;
    __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
    if (iv_len == 12) {
        const int32_t *ivi = (const int32_t *) iv;
        block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]);
    } else if (iv_len == 16) {
        block = _mm_loadu_si128((const __m128i *)iv);
    } else {
        exit(EXIT_FAILURE);
    }

    while (out_len >= 64) {
        __m128i nv0 = block;
        __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
        __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask);
        __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask);
        aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
        block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask);
        out += 64;
        out_len -= 64;
    }
    while (out_len >= 16) {
        aes256ni_encrypt(schedule, block, out);
        out += 16;
        out_len -= 16;
        block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
    }
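    /* Partial final block: encrypt one more counter block into a stack
     * buffer and copy out only the bytes requested. */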
    if (out_len > 0) {
        uint8_t tmp[16];
        aes256ni_encrypt(schedule, block, tmp);
        memcpy(out, tmp, out_len);
    }
}
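
/*
 * Illustrative call sequence (a sketch, not part of this file; these entry
 * points are normally reached through liboqs' AES dispatch layer):
 *
 *     void *st = NULL;
 *     uint8_t key[32] = {0}, iv[12] = {0}, ks[64];
 *     oqs_aes256_load_schedule_ni(key, &st);         // expand round keys
 *     oqs_aes256_load_iv_ni(iv, sizeof iv, st);      // set nonce, counter = 0
 *     oqs_aes256_ctr_enc_sch_upd_blks_ni(st, ks, 4); // 4 keystream blocks
 *     oqs_aes256_free_schedule_ni(st);               // zeroize and free
 */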