/src/liboqs/src/common/aes/aes256_ni.c
// SPDX-License-Identifier: Public domain
// Based on public domain code by Romain Dolbeau
// http://dolbeau.name/dolbeau/crypto/crypto.html

#include <assert.h>
#include <stdint.h>
#include <stdlib.h> /* exit(), EXIT_FAILURE */
#include <string.h>
#include <oqs/common.h>

#include <wmmintrin.h> /* AES-NI intrinsics */
#include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 */

#define AES_BLOCKBYTES 16

typedef struct {
    __m128i sk_exp[15]; /* expanded round keys: whitening key + 14 rounds */
    __m128i iv;         /* running CTR counter block (see oqs_aes256_load_iv_ni) */
} aes256ctx;

/* Read a 32-bit big-endian value from the byte buffer n. */
#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))

// From crypto_core/aes256encrypt/dolbeau/aesenc-int
static inline void aes256ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[15]) {
    __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0));
    __m128i key1 = _mm_loadu_si128((const __m128i *)(key + 16));
    __m128i temp0, temp1, temp2, temp4;
    int idx = 0;

    rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;

    /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
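    /*
     * BLOCK1 stores the pending odd-half round key and derives the next
     * even-half key; BLOCK2 does the reverse. In each, the two
     * _mm_slli_si128/_mm_xor_si128 pairs build the running XOR of the four
     * 32-bit words of the previous key, while _mm_aeskeygenassist_si128
     * supplies the SubWord/RotWord/Rcon word (broadcast from lane 0xff) for
     * BLOCK1 and the plain SubWord word (lane 0xaa) for BLOCK2, as the
     * AES-256 key schedule requires.
     */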
#define BLOCK1(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
    rkeys[idx++] = temp2; \
    temp4 = _mm_slli_si128(temp0, 4); \
    temp0 = _mm_xor_si128(temp0, temp4); \
    temp4 = _mm_slli_si128(temp0, 8); \
    temp0 = _mm_xor_si128(temp0, temp4); \
    temp1 = _mm_shuffle_epi32(temp1, 0xff); \
    temp0 = _mm_xor_si128(temp0, temp1)

#define BLOCK2(IMM) \
    temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
    rkeys[idx++] = temp0; \
    temp4 = _mm_slli_si128(temp2, 4); \
    temp2 = _mm_xor_si128(temp2, temp4); \
    temp4 = _mm_slli_si128(temp2, 8); \
    temp2 = _mm_xor_si128(temp2, temp4); \
    temp1 = _mm_shuffle_epi32(temp1, 0xaa); \
    temp2 = _mm_xor_si128(temp2, temp1)

    BLOCK1(0x01);
    BLOCK2(0x01);

    BLOCK1(0x02);
    BLOCK2(0x02);

    BLOCK1(0x04);
    BLOCK2(0x04);

    BLOCK1(0x08);
    BLOCK2(0x08);

    BLOCK1(0x10);
    BLOCK2(0x10);

    BLOCK1(0x20);
    BLOCK2(0x20);

    BLOCK1(0x40);
    rkeys[idx++] = temp0;
}

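/*
 * Allocates an aes256ctx and fills its round-key table. OQS_EXIT_IF_NULLPTR
 * aborts on allocation failure, so the assert below is only a debug-build
 * backstop.
 */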
void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule) {
    *_schedule = OQS_MEM_malloc(sizeof(aes256ctx));
    OQS_EXIT_IF_NULLPTR(*_schedule, "AES");
    assert(*_schedule != NULL);
    __m128i *schedule = ((aes256ctx *) *_schedule)->sk_exp;
    aes256ni_setkey_encrypt(key, schedule);
}

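/*
 * Stores the CTR counter block in a pre-swapped form: the shuffle mask keeps
 * bytes 0-7 in place and byte-reverses bytes 8-15, so the big-endian counter
 * at the end of the IV becomes a little-endian uint64 in the high lane that
 * can be advanced with _mm_add_epi64. A 12-byte IV is zero-padded to 16
 * bytes (counter starts at 0); any other length is rejected.
 */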
void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) {
    aes256ctx *ctx = _schedule;
    __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
    if (iv_len == 12) {
        const int32_t *ivi = (const int32_t *) iv;
        ctx->iv = _mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx);
    } else if (iv_len == 16) {
        ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx);
    } else {
        exit(EXIT_FAILURE);
    }
}

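/*
 * The 64-bit IV lands in bytes 0-7 (_mm_loadl_epi64 zeroes the upper lane),
 * so the counter half of the block starts at zero.
 */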
void oqs_aes256_load_iv_u64_ni(uint64_t iv, void *_schedule) {
    aes256ctx *ctx = _schedule;
    ctx->iv = _mm_loadl_epi64((__m128i *)&iv);
}

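/* Zeroizes the expanded round keys before releasing the context. */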
void oqs_aes256_free_schedule_ni(void *schedule) {
    if (schedule != NULL) {
        OQS_MEM_secure_free(schedule, sizeof(aes256ctx));
    }
}

// Modified from crypto_core/aes256encrypt/dolbeau/aesenc-int
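// One block: whitening XOR with rkeys[0], 13 AESENC rounds, then AESENCLAST
// for the final (14th) round, which skips MixColumns as AES requires.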
static inline void aes256ni_encrypt(const __m128i rkeys[15], __m128i nv, unsigned char *out) {
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);
    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *)(out), temp);
}

// 4x interleaved encryption
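// Keeping four independent blocks in flight lets the pipelined AESENC unit
// overlap rounds across streams instead of stalling on each round's latency.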
static inline void aes256ni_encrypt_x4(const __m128i rkeys[15], __m128i n0, __m128i n1, __m128i n2, __m128i n3, unsigned char *out) {
    __m128i temp0 = _mm_xor_si128(n0, rkeys[0]);
    __m128i temp1 = _mm_xor_si128(n1, rkeys[0]);
    __m128i temp2 = _mm_xor_si128(n2, rkeys[0]);
    __m128i temp3 = _mm_xor_si128(n3, rkeys[0]);

#define AESNENCX4(IDX) \
    temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \
    temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \
    temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \
    temp3 = _mm_aesenc_si128(temp3, rkeys[IDX])

    AESNENCX4(1);
    AESNENCX4(2);
    AESNENCX4(3);
    AESNENCX4(4);
    AESNENCX4(5);
    AESNENCX4(6);
    AESNENCX4(7);
    AESNENCX4(8);
    AESNENCX4(9);
    AESNENCX4(10);
    AESNENCX4(11);
    AESNENCX4(12);
    AESNENCX4(13);

    temp0 = _mm_aesenclast_si128(temp0, rkeys[14]);
    temp1 = _mm_aesenclast_si128(temp1, rkeys[14]);
    temp2 = _mm_aesenclast_si128(temp2, rkeys[14]);
    temp3 = _mm_aesenclast_si128(temp3, rkeys[14]);

    _mm_storeu_si128((__m128i *)(out + 0), temp0);
    _mm_storeu_si128((__m128i *)(out + 16), temp1);
    _mm_storeu_si128((__m128i *)(out + 32), temp2);
    _mm_storeu_si128((__m128i *)(out + 48), temp3);
}

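/* Encrypts a single 16-byte block with an already-expanded schedule. */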
void oqs_aes256_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
    const __m128i *schedule = ((const aes256ctx *) _schedule)->sk_exp;
    aes256ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext);
}

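/* ECB mode: each 16-byte block is encrypted independently; plaintext_len
 * must be a multiple of the block size. */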
void oqs_aes256_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
    assert(plaintext_len % 16 == 0);
    for (size_t block = 0; block < plaintext_len / 16; block++) {
        oqs_aes256_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
    }
}

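/*
 * Stateful CTR keystream generation: ctx->iv holds the pre-swapped counter
 * block (see oqs_aes256_load_iv_ni), so each block needs one shuffle back to
 * wire order before encryption, and the counter advances with plain 64-bit
 * adds in the high lane. Passing `schedule` directly as the round-key array
 * works because sk_exp is the first member of aes256ctx.
 */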
void oqs_aes256_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) {
    aes256ctx *ctx = (aes256ctx *) schedule;
    const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

    while (out_blks >= 4) {
        __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
        __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask);
        __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask);
        __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask);
        aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
        ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0));
        out += 64;
        out_blks -= 4;
    }
    while (out_blks >= 1) {
        __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
        aes256ni_encrypt(schedule, nv0, out);
        ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0));
        out += 16;
        out_blks--;
    }
}

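/*
 * Stateless CTR from an explicit IV: here the counter block stays in wire
 * byte order, so every 64-bit increment is bracketed by a byte-swap shuffle
 * and the block can be fed to the cipher unmodified. A trailing partial
 * block is produced into a stack buffer and copied out.
 */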
void oqs_aes256_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
    __m128i block;
    __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
    if (iv_len == 12) {
        const int32_t *ivi = (const int32_t *) iv;
        block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]);
    } else if (iv_len == 16) {
        block = _mm_loadu_si128((const __m128i *)iv);
    } else {
        exit(EXIT_FAILURE);
    }

    while (out_len >= 64) {
        __m128i nv0 = block;
        __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
        __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask);
        __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask);
        aes256ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
        block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask);
        out += 64;
        out_len -= 64;
    }
    while (out_len >= 16) {
        aes256ni_encrypt(schedule, block, out);
        out += 16;
        out_len -= 16;
        block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
    }
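    /* Partial final block: encrypt one more counter block into a stack
     * buffer and copy out only the bytes requested. */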
    if (out_len > 0) {
        uint8_t tmp[16];
        aes256ni_encrypt(schedule, block, tmp);
        memcpy(out, tmp, out_len);
    }
}
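
/*
 * Illustrative call sequence (a sketch, not part of this file; these entry
 * points are normally reached through liboqs' AES dispatch layer):
 *
 *     void *st = NULL;
 *     uint8_t key[32] = {0}, iv[12] = {0}, ks[64];
 *     oqs_aes256_load_schedule_ni(key, &st);         // expand round keys
 *     oqs_aes256_load_iv_ni(iv, sizeof iv, st);      // set nonce, counter = 0
 *     oqs_aes256_ctr_enc_sch_upd_blks_ni(st, ks, 4); // 4 keystream blocks
 *     oqs_aes256_free_schedule_ni(st);               // zeroize and free
 */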