/src/libgcrypt/cipher/poly1305.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* poly1305.c - Poly1305 internals and generic implementation |
2 | | * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
3 | | * |
4 | | * This file is part of Libgcrypt. |
5 | | * |
6 | | * Libgcrypt is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU Lesser General Public License as |
8 | | * published by the Free Software Foundation; either version 2.1 of |
9 | | * the License, or (at your option) any later version. |
10 | | * |
11 | | * Libgcrypt is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this program; if not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | #include <config.h> |
21 | | #include <stdio.h> |
22 | | #include <stdlib.h> |
23 | | #include <string.h> |
24 | | |
25 | | #include "types.h" |
26 | | #include "g10lib.h" |
27 | | #include "cipher.h" |
28 | | #include "bufhelp.h" |
29 | | #include "poly1305-internal.h" |
30 | | |
31 | | #include "mpi-internal.h" |
32 | | #include "longlong.h" |
33 | | |
34 | | |
/* Forward declaration; selftest() is defined at the end of this file and
 * is invoked once from _gcry_poly1305_init().  */
static const char *selftest (void);


/* Start with no assembly replacement for poly1305_blocks(); the macro is
 * defined again further down when the s390x assembly wrapper is used.  */
#undef HAVE_ASM_POLY1305_BLOCKS


/* Pick the generic implementation that matches the MPI limb size:
 *  - USE_MPI_64BIT: 8-byte limbs and a 64-bit integer type (HAVE_U64),
 *  - USE_MPI_32BIT: 4-byte limbs.
 * Any other configuration is a compile-time error.  */
#undef USE_MPI_64BIT
#undef USE_MPI_32BIT
#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64)
# define USE_MPI_64BIT 1
#elif BYTES_PER_MPI_LIMB == 4
# define USE_MPI_32BIT 1
#else
# error please implement for this limb size.
#endif
50 | | |
51 | | |
/* USE_S390X_ASM indicates whether to enable zSeries code.  It is only
 * enabled for 8-byte MPI limbs, when compiling for s390x with GCC >= 4 and
 * __ARCH__ >= 9, and when s390x inline assembly is usable
 * (HAVE_GCC_INLINE_ASM_S390X). */
#undef USE_S390X_ASM
#if BYTES_PER_MPI_LIMB == 8
# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
#  if defined(HAVE_GCC_INLINE_ASM_S390X)
#   define USE_S390X_ASM 1
#  endif /* HAVE_GCC_INLINE_ASM_S390X */
# endif
#endif
61 | | |
62 | | |
/* AMD64 Assembly implementations use SystemV ABI, ABI conversion and
 * additional stack to store XMM6-XMM15 needed on Win64.
 *
 * ASM_FUNC_ABI is appended to the prototypes of the assembly entry points
 * and ASM_FUNC_WRAPPER_ATTR is put on the C wrappers that call them.  Both
 * expand to nothing unless HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS is
 * defined, in which case the entry points are declared sysv_abi and the
 * wrappers are marked noinline. */
#undef ASM_FUNC_ABI
#undef ASM_FUNC_WRAPPER_ATTR
#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline))
#else
# define ASM_FUNC_ABI
# define ASM_FUNC_WRAPPER_ATTR
#endif
74 | | |
75 | | |
#ifdef USE_S390X_ASM

/* The s390x assembly routine replaces the generic block function.  */
#define HAVE_ASM_POLY1305_BLOCKS 1

extern unsigned int _gcry_poly1305_s390x_blocks1(void *state,
                                                 const byte *buf, size_t len,
                                                 byte high_pad);

/* Process LEN bytes from BUF by handing the Poly1305 state of CTX to the
 * s390x assembly implementation.  HIGH_PAD is forwarded unchanged.
 * Returns whatever the assembly routine returns (the callers treat it as
 * the number of stack bytes to burn).  */
static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
                 byte high_pad)
{
  void *state = &ctx->state;

  return _gcry_poly1305_s390x_blocks1 (state, buf, len, high_pad);
}

#endif /* USE_S390X_ASM */
92 | | |
93 | | |
#ifdef POLY1305_USE_AVX512

extern unsigned int
_gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len,
                                   void *hash, const void *key) ASM_FUNC_ABI;

/* C-side wrapper around the AVX512 assembly routine: passes the message
 * BUF/LEN together with the accumulator (h) and the multiplier (r) taken
 * from the state of CTX.  There is no high_pad argument on this path.
 * Returns the value reported by the assembly routine.  */
ASM_FUNC_WRAPPER_ATTR static unsigned int
poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf,
                             size_t len)
{
  return _gcry_poly1305_amd64_avx512_blocks (buf, len,
                                             ctx->state.h, ctx->state.r);
}

#endif /* POLY1305_USE_AVX512 */
109 | | |
110 | | |
#ifdef POLY1305_USE_PPC_VEC

/* PowerPC vector implementation, defined outside this file.  In this file
 * it is called from _gcry_poly1305_update_burn() with a pointer to the
 * POLY1305_STATE as KEY and a length that is a multiple of eight blocks;
 * its return value is used as a stack burn depth.  */
extern unsigned int
gcry_poly1305_p10le_4blocks(unsigned char *key, const byte *m, size_t len);

#endif /* POLY1305_USE_PPC_VEC */
117 | | |
118 | | |
119 | | static void poly1305_init (poly1305_context_t *ctx, |
120 | | const byte key[POLY1305_KEYLEN]) |
121 | 0 | { |
122 | 0 | POLY1305_STATE *st = &ctx->state; |
123 | 0 | unsigned int features = _gcry_get_hw_features (); |
124 | |
|
125 | 0 | #ifdef POLY1305_USE_AVX512 |
126 | 0 | ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0; |
127 | 0 | #endif |
128 | |
|
129 | | #ifdef POLY1305_USE_PPC_VEC |
130 | | ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; |
131 | | # ifdef ENABLE_FORCE_SOFT_HWFEATURES |
132 | | /* HWF_PPC_ARCH_3_10 above is used as soft HW-feature indicator for P10. |
133 | | * Actual implementation works with HWF_PPC_ARCH_3_00 also. */ |
134 | | ctx->use_p10 |= (features & HWF_PPC_ARCH_3_00) != 0; |
135 | | # endif |
136 | | #endif |
137 | |
|
138 | 0 | (void)features; |
139 | |
|
140 | 0 | ctx->leftover = 0; |
141 | |
|
142 | 0 | st->h[0] = 0; |
143 | 0 | st->h[1] = 0; |
144 | 0 | st->h[2] = 0; |
145 | 0 | st->h[3] = 0; |
146 | 0 | st->h[4] = 0; |
147 | |
|
148 | 0 | st->r[0] = buf_get_le32(key + 0) & 0x0fffffff; |
149 | 0 | st->r[1] = buf_get_le32(key + 4) & 0x0ffffffc; |
150 | 0 | st->r[2] = buf_get_le32(key + 8) & 0x0ffffffc; |
151 | 0 | st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc; |
152 | |
|
153 | 0 | st->k[0] = buf_get_le32(key + 16); |
154 | 0 | st->k[1] = buf_get_le32(key + 20); |
155 | 0 | st->k[2] = buf_get_le32(key + 24); |
156 | 0 | st->k[3] = buf_get_le32(key + 28); |
157 | 0 | } |
158 | | |
159 | | |
160 | | #ifdef USE_MPI_64BIT |
161 | | |
/* ADD_1305_64(A2, A1, A0, B2, B1, B0) adds the three-word value B2:B1:B0
 * to A2:A1:A0 (most significant word first), propagating carries from the
 * low word upwards.  The A arguments must be lvalues; they are updated in
 * place.  Architecture specific inline-assembly versions are tried first;
 * if none applies, the portable version built on add_ssaaaa() is used. */

#if defined (__aarch64__) && defined(HAVE_CPU_ARCH_ARM) && __GNUC__ >= 4

/* A += B (armv8/aarch64) */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("adds %0, %3, %0\n" \
               "adcs %1, %4, %1\n" \
               "adc %2, %5, %2\n" \
               : "+r" (A0), "+r" (A1), "+r" (A2) \
               : "r" (B0), "r" (B1), "r" (B2) \
               : "cc" )

#endif /* __aarch64__ */

#if defined (__x86_64__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 4

/* A += B (x86-64) */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addq %3, %0\n" \
               "adcq %4, %1\n" \
               "adcq %5, %2\n" \
               : "+r" (A0), "+r" (A1), "+r" (A2) \
               : "g" (B0), "g" (B1), "g" (B2) \
               : "cc" )

#endif /* __x86_64__ */

#if defined (__powerpc__) && defined(HAVE_CPU_ARCH_PPC) && __GNUC__ >= 4

/* A += B (ppc64) */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addc %0, %3, %0\n" \
               "adde %1, %4, %1\n" \
               "adde %2, %5, %2\n" \
               : "+r" (A0), "+r" (A1), "+r" (A2) \
               : "r" (B0), "r" (B1), "r" (B2) \
               : "cc" )

#endif /* __powerpc__ */

#ifndef ADD_1305_64
/* A += B (generic, mpi) */
/* The carry out of the low word is captured first, then the upper two
 * words are added as a pair and the saved carry is folded in. */
# define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
    u64 carry; \
    add_ssaaaa(carry, A0, 0, A0, 0, B0); \
    add_ssaaaa(A2, A1, A2, A1, B2, B1); \
    add_ssaaaa(A2, A1, A2, A1, 0, carry); \
  } while (0)
#endif
210 | | |
/* H = H * R mod 2¹³⁰-5 */
/* H2:H1:H0 is the accumulator (lvalues, updated in place) and R1:R0 the
 * multiplier.  R1_MULT5 is supplied by the caller; in this file it is
 * computed as (r1 >> 2) + r1.  The result is only partially reduced: the
 * bits above bit 129 of the intermediate sum are multiplied by 5 and
 * folded back into the low word, and H2 is left holding the small top
 * part.  Arguments are expanded more than once, so they must be free of
 * side effects. */
#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
    u64 x0_lo, x0_hi, x1_lo, x1_hi; \
    u64 t0_lo, t0_hi, t1_lo, t1_hi; \
    \
    /* x = a * r (partial mod 2^130-5) */ \
    umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \
    umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \
    \
    umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
    add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
    umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \
    add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
    \
    t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
    t1_hi = H2 * R0; /* h2 * r0 */ \
    add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
    \
    /* carry propagation */ \
    H2 = H0 & 3; \
    H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
    ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
  } while (0)
234 | | |
235 | | #ifndef HAVE_ASM_POLY1305_BLOCKS |
236 | | |
/* Generic 64-bit block function: absorb the 16-byte blocks found in
 * BUF[0..LEN) into the accumulator of CTX.
 *
 * HIGH_PAD is used as the top (third) word of every message block before
 * it is added to the accumulator; the callers in this file pass 1 for
 * ordinary blocks and 0 for the final, already padded block.
 *
 * LEN must be at least POLY1305_BLOCKSIZE: the first block is loaded and
 * LEN decremented before any length check.  Bytes beyond the last complete
 * block are not read.
 *
 * Returns an estimate of the stack bytes used, for burning by the caller.
 */
static unsigned int
poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len,
                         byte high_pad)
{
  POLY1305_STATE *st = &ctx->state;
  u64 r0, r1, r1_mult5;
  u64 h0, h1, h2;
  u64 m0, m1, m2;

  m2 = high_pad;

  /* Repack the 32-bit state words into 64-bit limbs. */
  h0 = st->h[0] + ((u64)st->h[1] << 32);
  h1 = st->h[2] + ((u64)st->h[3] << 32);
  h2 = st->h[4];

  r0 = st->r[0] + ((u64)st->r[1] << 32);
  r1 = st->r[2] + ((u64)st->r[3] << 32);

  r1_mult5 = (r1 >> 2) + r1;

  /* Load the first block up front; inside the loop the next block is
   * fetched between the add and the multiply of the current one. */
  m0 = buf_get_le64(buf + 0);
  m1 = buf_get_le64(buf + 8);
  buf += POLY1305_BLOCKSIZE;
  len -= POLY1305_BLOCKSIZE;

  while (len >= POLY1305_BLOCKSIZE)
    {
      /* a = h + m */
      ADD_1305_64(h2, h1, h0, m2, m1, m0);

      m0 = buf_get_le64(buf + 0);
      m1 = buf_get_le64(buf + 8);

      /* h = a * r (partial mod 2^130-5) */
      MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);

      buf += POLY1305_BLOCKSIZE;
      len -= POLY1305_BLOCKSIZE;
    }

  /* Handle the block that is still pending in m0/m1. */

  /* a = h + m */
  ADD_1305_64(h2, h1, h0, m2, m1, m0);

  /* h = a * r (partial mod 2^130-5) */
  MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);

  /* Split the limbs back into the 32-bit state words. */
  st->h[0] = h0;
  st->h[1] = h0 >> 32;
  st->h[2] = h1;
  st->h[3] = h1 >> 32;
  st->h[4] = h2;

  return 6 * sizeof (void *) + 18 * sizeof (u64);
}
291 | | |
292 | | static unsigned int |
293 | | poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, |
294 | | byte high_pad) |
295 | 0 | { |
296 | 0 | #ifdef POLY1305_USE_AVX512 |
297 | 0 | if ((high_pad & ctx->use_avx512) != 0) |
298 | 0 | return poly1305_amd64_avx512_blocks(ctx, buf, len); |
299 | 0 | #endif |
300 | | |
301 | 0 | return poly1305_blocks_generic(ctx, buf, len, high_pad); |
302 | 0 | } |
303 | | |
304 | | #endif /* !HAVE_ASM_POLY1305_BLOCKS */ |
305 | | |
/* Finish the computation for CTX and write the 16-byte tag to MAC
 * (64-bit limb version).
 *
 * Buffered leftover bytes, if any, are terminated with a 0x01 byte,
 * zero-filled to a full block and processed with high_pad 0.  The
 * accumulator is then reduced, the second key half (k) is added, and the
 * low 128 bits are stored in little-endian order.
 *
 * Returns an estimate of the stack bytes used, for burning by the caller.
 */
static unsigned int poly1305_final (poly1305_context_t *ctx,
                                    byte mac[POLY1305_TAGLEN])
{
  POLY1305_STATE *st = &ctx->state;
  unsigned int burn = 0;
  u64 u, carry;
  u64 k0, k1;
  u64 h0, h1;
  u64 h2;

  /* process the remaining block */
  if (ctx->leftover)
    {
      ctx->buffer[ctx->leftover++] = 1;
      if (ctx->leftover < POLY1305_BLOCKSIZE)
        {
          memset (&ctx->buffer[ctx->leftover], 0,
                  POLY1305_BLOCKSIZE - ctx->leftover);
          ctx->leftover = POLY1305_BLOCKSIZE;
        }
      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
    }

  /* Repack state and key words into 64-bit limbs. */
  h0 = st->h[0] + ((u64)st->h[1] << 32);
  h1 = st->h[2] + ((u64)st->h[3] << 32);
  h2 = st->h[4];

  k0 = st->k[0] + ((u64)st->k[1] << 32);
  k1 = st->k[2] + ((u64)st->k[3] << 32);

  /* check if h is more than 2^130-5, by adding 5. */
  /* Only the carries are of interest here; the sum words in u are
   * discarded. */
  add_ssaaaa(carry, u, 0, h0, 0, 5);
  add_ssaaaa(carry, u, 0, carry, 0, h1);
  u = (carry + h2) >> 2; /* u == 0 or 1 */

  /* minus 2^130-5 ... (+5) */
  /* (-u) & 5 is 5 when u is 1 and 0 otherwise; bits from 2^128 upwards
   * are not kept, since only h0 and h1 are written out below. */
  u = (-u) & 5;
  add_ssaaaa(h1, h0, h1, h0, 0, u);

  /* add high part of key + h */
  add_ssaaaa(h1, h0, h1, h0, k1, k0);
  buf_put_le64(mac + 0, h0);
  buf_put_le64(mac + 8, h1);

  /* burn_stack */
  return 4 * sizeof (void *) + 7 * sizeof (u64) + burn;
}
353 | | |
354 | | #endif /* USE_MPI_64BIT */ |
355 | | |
356 | | #ifdef USE_MPI_32BIT |
357 | | |
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

/* HI:LO += A * B (arm) */
/* HI and LO must be lvalues; they are both read and written by the
 * multiply-accumulate instruction. */
#define UMUL_ADD_32(HI, LO, A, B) \
      __asm__ ("umlal %1, %0, %4, %5" \
               : "=r" (HI), "=r" (LO) \
               : "0" (HI), "1" (LO), "r" (A), "r" (B) )

/* A += B (arm) */
/* A4..A0 += B4..B0 as five 32-bit words, most significant first.  Two
 * variants exist: when the compiler supports asm flag outputs, the carry
 * out of the four low words is returned through a flag output operand and
 * the top word is added in C; otherwise two asm statements are used and
 * the carry is handed from the first to the second in bit 31 of a
 * temporary register. */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
      u32 __carry; \
      __asm__ ("adds %0, %0, %5\n" \
               "adcs %1, %1, %6\n" \
               "adcs %2, %2, %7\n" \
               "adcs %3, %3, %8\n" \
               : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), \
                 "=@cccs" (__carry) \
               : "r" (B0), "r" (B1), "r" (B2), "r" (B3) \
               : ); \
      (A4) += (B4) + __carry; \
    } while (0)
#else
# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
      u32 __carry = (B0); \
      __asm__ ("adds %0, %0, %2\n" \
               "adcs %1, %1, %3\n" \
               "rrx %2, %2\n" /* carry to 31th bit */ \
               : "+r" (A0), "+r" (A1), "+r" (__carry) \
               : "r" (B1), "r" (0) \
               : "cc" ); \
      __asm__ ("lsls %0, %0, #1\n" /* carry from 31th bit */ \
               "adcs %1, %1, %4\n" \
               "adcs %2, %2, %5\n" \
               "adc %3, %3, %6\n" \
               : "+r" (__carry), "+r" (A2), "+r" (A3), "+r" (A4) \
               : "r" (B2), "r" (B3), "r" (B4) \
               : "cc" ); \
    } while (0)
#endif

#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
400 | | |
#if defined (__i386__) && defined(HAVE_CPU_ARCH_X86) && __GNUC__ >= 5
/* Note: ADD_1305_32 below does not compile on GCC-4.7 */

/* A += B (i386) */
/* A4..A0 += B4..B0 as five 32-bit words, most significant first, using
 * one add followed by four add-with-carry instructions.  The A arguments
 * must be lvalues. */
#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
      __asm__ ("addl %5, %0\n" \
               "adcl %6, %1\n" \
               "adcl %7, %2\n" \
               "adcl %8, %3\n" \
               "adcl %9, %4\n" \
               : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
               : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
               : "cc" )

#endif /* __i386__ */
416 | | |
/* Portable fallbacks, used when no architecture specific version of the
 * respective macro has been defined above. */

#ifndef UMUL_ADD_32
/* HI:LO += A * B (generic, mpi) */
# define UMUL_ADD_32(HI, LO, A, B) do { \
    u32 t_lo, t_hi; \
    umul_ppmm(t_hi, t_lo, A, B); \
    add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \
  } while (0)
#endif

#ifndef ADD_1305_32
/* A += B (generic, mpi) */
/* The three low words are added one at a time, each addition producing a
 * carry word that is folded into the next one; the two top words are
 * added as a pair and receive the last carry. */
# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
    u32 carry0, carry1, carry2; \
    add_ssaaaa(carry0, A0, 0, A0, 0, B0); \
    add_ssaaaa(carry1, A1, 0, A1, 0, B1); \
    add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \
    add_ssaaaa(carry2, A2, 0, A2, 0, B2); \
    add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \
    add_ssaaaa(A4, A3, A4, A3, B4, B3); \
    add_ssaaaa(A4, A3, A4, A3, 0, carry2); \
  } while (0)
#endif
439 | | |
/* H = H * R mod 2¹³⁰-5 */
/* H4..H0 is the accumulator (lvalues, updated in place) and R3..R0 the
 * multiplier, both as 32-bit words.  R3_MULT5, R2_MULT5 and R1_MULT5 are
 * supplied by the caller; in this file they are computed as
 * (r >> 2) + r for the respective word.  The result is only partially
 * reduced: the bits above bit 129 of the intermediate sum are multiplied
 * by 5 and folded back into the low word.  Arguments are expanded more
 * than once, so they must be free of side effects. */
#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
                        R3_MULT5, R2_MULT5, R1_MULT5) do { \
    u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \
    u32 t0_lo, t0_hi; \
    \
    /* x = a * r (partial mod 2^130-5) */ \
    umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \
    umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \
    umul_ppmm(x2_hi, x2_lo, H0, R2); /* h0 * r2 */ \
    umul_ppmm(x3_hi, x3_lo, H0, R3); /* h0 * r3 */ \
    \
    UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(x1_hi, x1_lo, H1, R0); /* h1 * r0 */ \
    UMUL_ADD_32(x2_hi, x2_lo, H1, R1); /* h1 * r1 */ \
    UMUL_ADD_32(x3_hi, x3_lo, H1, R2); /* h1 * r2 */ \
    \
    UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
    UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(x2_hi, x2_lo, H2, R0); /* h2 * r0 */ \
    UMUL_ADD_32(x3_hi, x3_lo, H2, R1); /* h2 * r1 */ \
    \
    UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
    H1 = x0_hi; \
    UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
    UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
    UMUL_ADD_32(x3_hi, x3_lo, H3, R0); /* h3 * r0 */ \
    \
    t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
    t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
    add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \
    add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \
    t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
    t0_hi = H4 * R0; /* h4 * r0 */ \
    add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \
    \
    /* carry propagation */ \
    H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
    H4 = H4 & 3; \
    ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
  } while (0)
481 | | |
482 | | #ifndef HAVE_ASM_POLY1305_BLOCKS |
483 | | |
484 | | static unsigned int |
485 | | poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, |
486 | | byte high_pad) |
487 | | { |
488 | | POLY1305_STATE *st = &ctx->state; |
489 | | u32 r1_mult5, r2_mult5, r3_mult5; |
490 | | u32 h0, h1, h2, h3, h4; |
491 | | u32 m0, m1, m2, m3, m4; |
492 | | |
493 | | m4 = high_pad; |
494 | | |
495 | | h0 = st->h[0]; |
496 | | h1 = st->h[1]; |
497 | | h2 = st->h[2]; |
498 | | h3 = st->h[3]; |
499 | | h4 = st->h[4]; |
500 | | |
501 | | r1_mult5 = (st->r[1] >> 2) + st->r[1]; |
502 | | r2_mult5 = (st->r[2] >> 2) + st->r[2]; |
503 | | r3_mult5 = (st->r[3] >> 2) + st->r[3]; |
504 | | |
505 | | while (len >= POLY1305_BLOCKSIZE) |
506 | | { |
507 | | m0 = buf_get_le32(buf + 0); |
508 | | m1 = buf_get_le32(buf + 4); |
509 | | m2 = buf_get_le32(buf + 8); |
510 | | m3 = buf_get_le32(buf + 12); |
511 | | |
512 | | /* a = h + m */ |
513 | | ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0); |
514 | | |
515 | | /* h = a * r (partial mod 2^130-5) */ |
516 | | MUL_MOD_1305_32(h4, h3, h2, h1, h0, |
517 | | st->r[3], st->r[2], st->r[1], st->r[0], |
518 | | r3_mult5, r2_mult5, r1_mult5); |
519 | | |
520 | | buf += POLY1305_BLOCKSIZE; |
521 | | len -= POLY1305_BLOCKSIZE; |
522 | | } |
523 | | |
524 | | st->h[0] = h0; |
525 | | st->h[1] = h1; |
526 | | st->h[2] = h2; |
527 | | st->h[3] = h3; |
528 | | st->h[4] = h4; |
529 | | |
530 | | return 6 * sizeof (void *) + 28 * sizeof (u32); |
531 | | } |
532 | | |
533 | | #endif /* !HAVE_ASM_POLY1305_BLOCKS */ |
534 | | |
/* Finish the computation for CTX and write the 16-byte tag to MAC
 * (32-bit limb version).
 *
 * Buffered leftover bytes, if any, are terminated with a 0x01 byte,
 * zero-filled to a full block and processed with high_pad 0.  The
 * accumulator is then reduced, the second key half (k) is added, and the
 * low 128 bits are stored in little-endian order.
 *
 * Returns an estimate of the stack bytes used, for burning by the caller.
 */
static unsigned int poly1305_final (poly1305_context_t *ctx,
                                    byte mac[POLY1305_TAGLEN])
{
  POLY1305_STATE *st = &ctx->state;
  unsigned int burn = 0;
  u32 carry, tmp0, tmp1, tmp2, u;
  u32 h4, h3, h2, h1, h0;

  /* process the remaining block */
  if (ctx->leftover)
    {
      ctx->buffer[ctx->leftover++] = 1;
      if (ctx->leftover < POLY1305_BLOCKSIZE)
        {
          memset (&ctx->buffer[ctx->leftover], 0,
                  POLY1305_BLOCKSIZE - ctx->leftover);
          ctx->leftover = POLY1305_BLOCKSIZE;
        }
      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
    }

  h0 = st->h[0];
  h1 = st->h[1];
  h2 = st->h[2];
  h3 = st->h[3];
  h4 = st->h[4];

  /* check if h is more than 2^130-5, by adding 5. */
  /* Only the carries are of interest here; the sum words in tmp0 are
   * discarded. */
  add_ssaaaa(carry, tmp0, 0, h0, 0, 5);
  add_ssaaaa(carry, tmp0, 0, carry, 0, h1);
  add_ssaaaa(carry, tmp0, 0, carry, 0, h2);
  add_ssaaaa(carry, tmp0, 0, carry, 0, h3);
  u = (carry + h4) >> 2; /* u == 0 or 1 */

  /* minus 2^130-5 ... (+5) */
  /* (-u) & 5 is 5 when u is 1 and 0 otherwise; the carry out of h3 is
   * dropped, since only h0..h3 are written out below. */
  u = (-u) & 5;
  add_ssaaaa(carry, h0, 0, h0, 0, u);
  add_ssaaaa(carry, h1, 0, h1, 0, carry);
  add_ssaaaa(carry, h2, 0, h2, 0, carry);
  add_ssaaaa(carry, h3, 0, h3, 0, carry);

  /* add high part of key + h */
  /* tmp0..tmp2 carry the overflow of each word into the next; the carry
   * out of the top word is not used. */
  add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]);
  add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]);
  add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0);
  add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]);
  add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1);
  add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]);
  h3 += tmp2;

  buf_put_le32(mac + 0, h0);
  buf_put_le32(mac + 4, h1);
  buf_put_le32(mac + 8, h2);
  buf_put_le32(mac + 12, h3);

  /* burn_stack */
  return 4 * sizeof (void *) + 10 * sizeof (u32) + burn;
}
593 | | |
594 | | #endif /* USE_MPI_32BIT */ |
595 | | |
596 | | |
597 | | unsigned int |
598 | | _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m, |
599 | | size_t bytes) |
600 | 0 | { |
601 | 0 | unsigned int burn = 0; |
602 | 0 | unsigned int nburn; |
603 | | |
604 | | /* handle leftover */ |
605 | 0 | if (ctx->leftover) |
606 | 0 | { |
607 | 0 | size_t want = (POLY1305_BLOCKSIZE - ctx->leftover); |
608 | 0 | if (want > bytes) |
609 | 0 | want = bytes; |
610 | 0 | buf_cpy (ctx->buffer + ctx->leftover, m, want); |
611 | 0 | bytes -= want; |
612 | 0 | m += want; |
613 | 0 | ctx->leftover += want; |
614 | 0 | if (ctx->leftover < POLY1305_BLOCKSIZE) |
615 | 0 | return 0; |
616 | 0 | nburn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1); |
617 | 0 | burn = nburn > burn ? nburn : burn; |
618 | 0 | ctx->leftover = 0; |
619 | 0 | } |
620 | | |
621 | | #ifdef POLY1305_USE_PPC_VEC |
622 | | /* PPC-P10/little-endian: bulk process multiples of eight blocks */ |
623 | | if (ctx->use_p10 && bytes >= POLY1305_BLOCKSIZE * 8) |
624 | | { |
625 | | size_t nblks = bytes / (POLY1305_BLOCKSIZE * 8); |
626 | | size_t len = nblks * (POLY1305_BLOCKSIZE * 8); |
627 | | POLY1305_STATE *st = &ctx->state; |
628 | | nburn = gcry_poly1305_p10le_4blocks ((unsigned char *) st, m, len); |
629 | | burn = nburn > burn ? nburn : burn; |
630 | | m += len; |
631 | | bytes -= len; |
632 | | } |
633 | | #endif /* POLY1305_USE_PPC_VEC */ |
634 | | |
635 | | /* process full blocks */ |
636 | 0 | if (bytes >= POLY1305_BLOCKSIZE) |
637 | 0 | { |
638 | 0 | size_t nblks = bytes / POLY1305_BLOCKSIZE; |
639 | 0 | nburn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1); |
640 | 0 | burn = nburn > burn ? nburn : burn; |
641 | 0 | m += nblks * POLY1305_BLOCKSIZE; |
642 | 0 | bytes -= nblks * POLY1305_BLOCKSIZE; |
643 | 0 | } |
644 | | |
645 | | /* store leftover */ |
646 | 0 | if (bytes) |
647 | 0 | { |
648 | 0 | buf_cpy (ctx->buffer + ctx->leftover, m, bytes); |
649 | 0 | ctx->leftover += bytes; |
650 | 0 | } |
651 | |
|
652 | 0 | return burn; |
653 | 0 | } |
654 | | |
655 | | |
656 | | void |
657 | | _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes) |
658 | 0 | { |
659 | 0 | unsigned int burn; |
660 | |
|
661 | 0 | burn = _gcry_poly1305_update_burn (ctx, m, bytes); |
662 | |
|
663 | 0 | if (burn) |
664 | 0 | _gcry_burn_stack (burn); |
665 | 0 | } |
666 | | |
667 | | |
668 | | void |
669 | | _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) |
670 | 0 | { |
671 | 0 | unsigned int burn; |
672 | |
|
673 | 0 | burn = poly1305_final (ctx, mac); |
674 | |
|
675 | 0 | _gcry_burn_stack (burn); |
676 | 0 | } |
677 | | |
678 | | |
679 | | gcry_err_code_t |
680 | | _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, |
681 | | size_t keylen) |
682 | 0 | { |
683 | 0 | static int initialized; |
684 | 0 | static const char *selftest_failed; |
685 | |
|
686 | 0 | if (!initialized) |
687 | 0 | { |
688 | 0 | initialized = 1; |
689 | 0 | selftest_failed = selftest (); |
690 | 0 | if (selftest_failed) |
691 | 0 | log_error ("Poly1305 selftest failed (%s)\n", selftest_failed); |
692 | 0 | } |
693 | |
|
694 | 0 | if (keylen != POLY1305_KEYLEN) |
695 | 0 | return GPG_ERR_INV_KEYLEN; |
696 | | |
697 | 0 | if (selftest_failed) |
698 | 0 | return GPG_ERR_SELFTEST_FAILED; |
699 | | |
700 | 0 | poly1305_init (ctx, key); |
701 | |
|
702 | 0 | return 0; |
703 | 0 | } |
704 | | |
705 | | |
706 | | static void |
707 | | poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes, |
708 | | const byte * key) |
709 | 0 | { |
710 | 0 | poly1305_context_t ctx; |
711 | |
|
712 | 0 | memset (&ctx, 0, sizeof (ctx)); |
713 | |
|
714 | 0 | _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN); |
715 | 0 | _gcry_poly1305_update (&ctx, m, bytes); |
716 | 0 | _gcry_poly1305_finish (&ctx, mac); |
717 | |
|
718 | 0 | wipememory (&ctx, sizeof (ctx)); |
719 | 0 | } |
720 | | |
721 | | |
722 | | static const char * |
723 | | selftest (void) |
724 | 0 | { |
725 | | /* example from nacl */ |
726 | 0 | static const byte nacl_key[POLY1305_KEYLEN] = { |
727 | 0 | 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91, |
728 | 0 | 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25, |
729 | 0 | 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65, |
730 | 0 | 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80, |
731 | 0 | }; |
732 | |
|
733 | 0 | static const byte nacl_msg[131] = { |
734 | 0 | 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73, |
735 | 0 | 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce, |
736 | 0 | 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4, |
737 | 0 | 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a, |
738 | 0 | 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b, |
739 | 0 | 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72, |
740 | 0 | 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2, |
741 | 0 | 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38, |
742 | 0 | 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a, |
743 | 0 | 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae, |
744 | 0 | 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea, |
745 | 0 | 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda, |
746 | 0 | 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde, |
747 | 0 | 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3, |
748 | 0 | 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6, |
749 | 0 | 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74, |
750 | 0 | 0xe3, 0x55, 0xa5 |
751 | 0 | }; |
752 | |
|
753 | 0 | static const byte nacl_mac[16] = { |
754 | 0 | 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5, |
755 | 0 | 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 |
756 | 0 | }; |
757 | | |
758 | | /* generates a final value of (2^130 - 2) == 3 */ |
759 | 0 | static const byte wrap_key[POLY1305_KEYLEN] = { |
760 | 0 | 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
761 | 0 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
762 | 0 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
763 | 0 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
764 | 0 | }; |
765 | |
|
766 | 0 | static const byte wrap_msg[16] = { |
767 | 0 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
768 | 0 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
769 | 0 | }; |
770 | |
|
771 | 0 | static const byte wrap_mac[16] = { |
772 | 0 | 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
773 | 0 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
774 | 0 | }; |
775 | | |
776 | | /* mac of the macs of messages of length 0 to 256, where the key and messages |
777 | | * have all their values set to the length |
778 | | */ |
779 | 0 | static const byte total_key[POLY1305_KEYLEN] = { |
780 | 0 | 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
781 | 0 | 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, |
782 | 0 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
783 | 0 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
784 | 0 | }; |
785 | |
|
786 | 0 | static const byte total_mac[16] = { |
787 | 0 | 0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd, |
788 | 0 | 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39 |
789 | 0 | }; |
790 | |
|
791 | 0 | poly1305_context_t ctx; |
792 | 0 | poly1305_context_t total_ctx; |
793 | 0 | byte all_key[POLY1305_KEYLEN]; |
794 | 0 | byte all_msg[256]; |
795 | 0 | byte mac[16]; |
796 | 0 | size_t i, j; |
797 | |
|
798 | 0 | memset (&ctx, 0, sizeof (ctx)); |
799 | 0 | memset (&total_ctx, 0, sizeof (total_ctx)); |
800 | |
|
801 | 0 | memset (mac, 0, sizeof (mac)); |
802 | 0 | poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key); |
803 | 0 | if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) |
804 | 0 | return "Poly1305 test 1 failed."; |
805 | | |
806 | | /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so |
807 | | * make sure everything still works varying between them */ |
808 | 0 | memset (mac, 0, sizeof (mac)); |
809 | 0 | _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN); |
810 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 0, 32); |
811 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 32, 64); |
812 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 96, 16); |
813 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 112, 8); |
814 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 120, 4); |
815 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 124, 2); |
816 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 126, 1); |
817 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 127, 1); |
818 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 128, 1); |
819 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 129, 1); |
820 | 0 | _gcry_poly1305_update (&ctx, nacl_msg + 130, 1); |
821 | 0 | _gcry_poly1305_finish (&ctx, mac); |
822 | 0 | if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0) |
823 | 0 | return "Poly1305 test 2 failed."; |
824 | | |
825 | 0 | memset (mac, 0, sizeof (mac)); |
826 | 0 | poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key); |
827 | 0 | if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0) |
828 | 0 | return "Poly1305 test 3 failed."; |
829 | | |
830 | 0 | _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN); |
831 | 0 | for (i = 0; i < 256; i++) |
832 | 0 | { |
833 | | /* set key and message to 'i,i,i..' */ |
834 | 0 | for (j = 0; j < sizeof (all_key); j++) |
835 | 0 | all_key[j] = i; |
836 | 0 | for (j = 0; j < i; j++) |
837 | 0 | all_msg[j] = i; |
838 | 0 | poly1305_auth (mac, all_msg, i, all_key); |
839 | 0 | _gcry_poly1305_update (&total_ctx, mac, 16); |
840 | 0 | } |
841 | 0 | _gcry_poly1305_finish (&total_ctx, mac); |
842 | 0 | if (memcmp (total_mac, mac, sizeof (total_mac)) != 0) |
843 | 0 | return "Poly1305 test 4 failed."; |
844 | | |
845 | 0 | return NULL; |
846 | 0 | } |