/src/openssl/crypto/poly1305/poly1305.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.  | 
3  |  |  *  | 
4  |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use  | 
5  |  |  * this file except in compliance with the License.  You can obtain a copy  | 
6  |  |  * in the file LICENSE in the source distribution or at  | 
7  |  |  * https://www.openssl.org/source/license.html  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include <stdlib.h>  | 
11  |  | #include <string.h>  | 
12  |  | #include <openssl/crypto.h>  | 
13  |  |  | 
14  |  | #include "crypto/poly1305.h"  | 
15  |  |  | 
16  |  | size_t Poly1305_ctx_size(void)  | 
17  | 0  | { /* Returns sizeof(struct poly1305_context), i.e. the size of the context structure in bytes. */ | 
18  | 0  |     return sizeof(struct poly1305_context);  | 
19  | 0  | }  | 
20  |  |  | 
21  |  | /* Read the four bytes p[0..3] as one 32-bit unsigned integer in little endian order (p[0] is the least significant byte). */  | 
22  |  | static unsigned int U8TOU32(const unsigned char *p)  | 
23  | 0  | { /* Each byte is masked with 0xff, widened to unsigned int, then shifted into its position. */ | 
24  | 0  |     return (((unsigned int)(p[0] & 0xff)) |  | 
25  | 0  |             ((unsigned int)(p[1] & 0xff) << 8) |  | 
26  | 0  |             ((unsigned int)(p[2] & 0xff) << 16) |  | 
27  | 0  |             ((unsigned int)(p[3] & 0xff) << 24));  | 
28  | 0  | }  | 
29  |  |  | 
30  |  | /*  | 
31  |  |  * Implementations can be classified by amount of significant bits in  | 
32  |  |  * words making up the multi-precision value, or in other words radix  | 
33  |  |  * or base of numerical representation, e.g. base 2^64, base 2^32,  | 
34  |  |  * base 2^26. Complementary characteristic is how wide is the result of  | 
35  |  |  * multiplication of pair of digits, e.g. it would take 128 bits to  | 
36  |  |  * accommodate multiplication result in base 2^64 case. These are used  | 
37  |  |  * interchangeably. To describe implementation that is. But interface  | 
38  |  |  * is designed to isolate this so that low-level primitives implemented  | 
39  |  |  * in assembly can be self-contained/self-coherent.  | 
40  |  |  */  | 
41  |  | #ifndef POLY1305_ASM  | 
42  |  | /*  | 
43  |  |  * Even though there is __int128 reference implementation targeting  | 
44  |  |  * 64-bit platforms provided below, it's not obvious that it's optimal  | 
45  |  |  * choice for every one of them. Depending on instruction set overall  | 
46  |  |  * amount of instructions can be comparable to one in __int64  | 
47  |  |  * implementation. Amount of multiplication instructions would be lower,  | 
48  |  |  * but not necessarily overall. And in out-of-order execution context,  | 
49  |  |  * it is the latter that can be crucial...  | 
50  |  |  *  | 
51  |  |  * On related note. Poly1305 author, D. J. Bernstein, discusses and  | 
52  |  |  * provides floating-point implementations of the algorithm in question.  | 
53  |  |  * It made a lot of sense by the time of introduction, because most  | 
54  |  |  * then-modern processors didn't have pipelined integer multiplier.  | 
55  |  |  * [Not to mention that some had non-constant timing for integer  | 
56  |  |  * multiplications.] Floating-point instructions on the other hand could  | 
57  |  |  * be issued every cycle, which allowed to achieve better performance.  | 
58  |  |  * Nowadays, with SIMD and/or out-of-order execution, shared or  | 
59  |  |  * even emulated FPU, it's more complicated, and floating-point  | 
60  |  |  * implementation is not necessarily optimal choice in every situation,  | 
61  |  |  * rather contrary...  | 
62  |  |  *  | 
63  |  |  *                                              <appro@openssl.org>  | 
64  |  |  */  | 
65  |  |  | 
66  |  | typedef unsigned int u32; /* NOTE(review): the code below truncates with (u32) casts and shifts by 32, so it relies on unsigned int being exactly 32 bits - confirm for the target ABI */  | 
67  |  |  | 
68  |  | /*  | 
69  |  |  * poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks  | 
70  |  |  * of |inp| no longer than |len|. Behaviour for |len| not divisible by  | 
71  |  |  * block size is unspecified in general case, even though in reference  | 
72  |  |  * implementation the trailing chunk is simply ignored. Per algorithm  | 
73  |  |  * specification, every input block, complete or last partial, is to be  | 
74  |  |  * padded with a bit past most significant byte. The latter kind is then  | 
75  |  |  * padded with zeros till block size. This last partial block padding  | 
76  |  |  * is caller(*)'s responsibility, and because of this the last partial  | 
77  |  |  * block is always processed with separate call with |len| set to  | 
78  |  |  * POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit|  | 
79  |  |  * should be set to 1 to perform implicit padding with 128th bit.  | 
80  |  |  * poly1305_blocks does not actually check for this constraint though,  | 
81  |  |  * it's caller(*)'s responsibility to comply.  | 
82  |  |  *  | 
83  |  |  * (*)  In the context "caller" is not application code, but higher  | 
84  |  |  *      level Poly1305_* from this very module, so that quirks are  | 
85  |  |  *      handled locally.  | 
86  |  |  */  | 
87  |  | static void  | 
88  |  | poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit);  | 
89  |  |  | 
90  |  | /*  | 
91  |  |  * Type-agnostic "rip-off" from constant_time.h  | 
92  |  |  */  | 
93  |  | # define CONSTANT_TIME_CARRY(a,b) ( \  | 
94  |  |          (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \  | 
95  |  |          )  | 
96  |  |  | 
97  |  | # if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \  | 
98  |  |      (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)  | 
99  |  |  | 
100  |  | typedef unsigned long u64; /* 8 bytes wide in this branch, per the __SIZEOF_LONG__==8 guard above */  | 
101  |  | typedef __uint128_t u128; /* holds the full product of two u64 values */  | 
102  |  |  | 
103  |  | typedef struct { /* Internal state of the base 2^64 implementation; accessed by casting the opaque ctx pointer. */ | 
104  |  |     u64 h[3]; /* accumulator limbs, least significant first; h[2] carries the bits at and above 2^128 */  | 
105  |  |     u64 r[2]; /* masked ("clamped") first 16 key bytes, least significant limb first */  | 
106  |  | } poly1305_internal;  | 
107  |  |  | 
108  |  | /* Read the eight bytes p[0..7] as one 64-bit unsigned integer in little endian order (p[0] is the least significant byte). */  | 
109  |  | static u64 U8TOU64(const unsigned char *p)  | 
110  |  | { /* Each byte is widened to u64 before shifting so that shifts of 32 and more are well defined. */ | 
111  |  |     return (((u64)(p[0] & 0xff)) |  | 
112  |  |             ((u64)(p[1] & 0xff) << 8) |  | 
113  |  |             ((u64)(p[2] & 0xff) << 16) |  | 
114  |  |             ((u64)(p[3] & 0xff) << 24) |  | 
115  |  |             ((u64)(p[4] & 0xff) << 32) |  | 
116  |  |             ((u64)(p[5] & 0xff) << 40) |  | 
117  |  |             ((u64)(p[6] & 0xff) << 48) |  | 
118  |  |             ((u64)(p[7] & 0xff) << 56));  | 
119  |  | }  | 
120  |  |  | 
121  |  | /* Store the 64-bit unsigned integer v into the eight bytes p[0..7] in little endian order (p[0] receives the least significant byte). */  | 
122  |  | static void U64TO8(unsigned char *p, u64 v)  | 
123  |  | { | 
124  |  |     p[0] = (unsigned char)((v) & 0xff);  | 
125  |  |     p[1] = (unsigned char)((v >> 8) & 0xff);  | 
126  |  |     p[2] = (unsigned char)((v >> 16) & 0xff);  | 
127  |  |     p[3] = (unsigned char)((v >> 24) & 0xff);  | 
128  |  |     p[4] = (unsigned char)((v >> 32) & 0xff);  | 
129  |  |     p[5] = (unsigned char)((v >> 40) & 0xff);  | 
130  |  |     p[6] = (unsigned char)((v >> 48) & 0xff);  | 
131  |  |     p[7] = (unsigned char)((v >> 56) & 0xff);  | 
132  |  | }  | 
133  |  |  | 
134  |  | static void poly1305_init(void *ctx, const unsigned char key[16])  | 
135  |  | { /* Base 2^64 variant: reset the accumulator and load the masked multiplier r from the 16 key bytes. ctx is treated as a poly1305_internal. */ | 
136  |  |     poly1305_internal *st = (poly1305_internal *) ctx;  | 
137  |  |  | 
138  |  |     /* h = 0: start with an empty accumulator */  | 
139  |  |     st->h[0] = 0;  | 
140  |  |     st->h[1] = 0;  | 
141  |  |     st->h[2] = 0;  | 
142  |  |  | 
143  |  |     /* clamp: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, applied as two 64-bit halves read little endian */  | 
144  |  |     st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; /* low 64 bits of r */  | 
145  |  |     st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; /* high 64 bits of r; the two lowest bits are cleared */  | 
146  |  | }  | 
147  |  |  | 
148  |  | static void  | 
149  |  | poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)  | 
150  |  | { /* Base 2^64 variant. For every whole POLY1305_BLOCK_SIZE block of inp: h = (h + block + padbit * 2^128) * r, partially reduced modulo 2^130 - 5. A trailing chunk shorter than a block is not processed. */ | 
151  |  |     poly1305_internal *st = (poly1305_internal *)ctx;  | 
152  |  |     u64 r0, r1;  | 
153  |  |     u64 s1;  | 
154  |  |     u64 h0, h1, h2, c;  | 
155  |  |     u128 d0, d1;  | 
156  |  |  | 
157  |  |     r0 = st->r[0];  | 
158  |  |     r1 = st->r[1];  | 
159  |  |  | 
160  |  |     s1 = r1 + (r1 >> 2); /* s1 = 5 * (r1 / 4); the masking in poly1305_init cleared the two low bits of r1, so the division is exact */  | 
161  |  |  | 
162  |  |     h0 = st->h[0]; /* work on local copies of the accumulator; written back after the loop */  | 
163  |  |     h1 = st->h[1];  | 
164  |  |     h2 = st->h[2];  | 
165  |  |  | 
166  |  |     while (len >= POLY1305_BLOCK_SIZE) { | 
167  |  |         /* h += m[i]: add the block as two little endian 64-bit words, carrying through the upper half of the u128 sums */  | 
168  |  |         h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0));  | 
169  |  |         h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8));  | 
170  |  |         /*  | 
171  |  |          * padbit can be zero only when original len was  | 
172  |  |          * POLY1305_BLOCK_SIZE, but we don't check  | 
173  |  |          */  | 
174  |  |         h2 += (u64)(d1 >> 64) + padbit; /* carry out of h1 plus the pad bit, both at weight 2^128 */  | 
175  |  |  | 
176  |  |         /* h *= r "%" p, where "%" stands for "partial remainder" */  | 
177  |  |         d0 = ((u128)h0 * r0) +  | 
178  |  |              ((u128)h1 * s1);  | 
179  |  |         d1 = ((u128)h0 * r1) +  | 
180  |  |              ((u128)h1 * r0) +  | 
181  |  |              (h2 * s1);  | 
182  |  |         h2 = (h2 * r0); /* plain 64-bit product; h2 is not widened to u128 here or in the d1 term above */  | 
183  |  |  | 
184  |  |         /* last reduction step: */  | 
185  |  |         /* a) h2:h0 = h2<<128 + d1<<64 + d0 */  | 
186  |  |         h0 = (u64)d0;  | 
187  |  |         h1 = (u64)(d1 += d0 >> 64);  | 
188  |  |         h2 += (u64)(d1 >> 64);  | 
189  |  |         /* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */  | 
190  |  |         c = (h2 >> 2) + (h2 & ~3UL); /* c = 5 * (h2 >> 2), written as x + 4x */  | 
191  |  |         h2 &= 3; /* keep only the bits below 2^130 */  | 
192  |  |         h0 += c;  | 
193  |  |         h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* c becomes 1 if the addition into h0 wrapped, else 0 */  | 
194  |  |         h2 += CONSTANT_TIME_CARRY(h1,c); /* propagate a possible wrap of h1 */  | 
195  |  |         /*  | 
196  |  |          * Occasional overflows to 3rd bit of h2 are taken care of  | 
197  |  |          * "naturally". If after this point we end up at the top of  | 
198  |  |          * this loop, then the overflow bit will be accounted for  | 
199  |  |          * in next iteration. If we end up in poly1305_emit, then  | 
200  |  |          * comparison to modulus below will still count as "carry  | 
201  |  |          * into 131st bit", so that properly reduced value will be  | 
202  |  |          * picked in conditional move.  | 
203  |  |          */  | 
204  |  |  | 
205  |  |         inp += POLY1305_BLOCK_SIZE;  | 
206  |  |         len -= POLY1305_BLOCK_SIZE;  | 
207  |  |     }  | 
208  |  |  | 
209  |  |     st->h[0] = h0; /* store the updated accumulator back into the context */  | 
210  |  |     st->h[1] = h1;  | 
211  |  |     st->h[2] = h2;  | 
212  |  | }  | 
213  |  |  | 
214  |  | static void poly1305_emit(void *ctx, unsigned char mac[16],  | 
215  |  |                           const u32 nonce[4])  | 
216  |  | { /* Base 2^64 variant. Writes the 16-byte tag to mac: the accumulator fully reduced modulo 2^130 - 5, plus the 128-bit nonce, modulo 2^128, stored little endian. The context is only read. */ | 
217  |  |     poly1305_internal *st = (poly1305_internal *) ctx;  | 
218  |  |     u64 h0, h1, h2;  | 
219  |  |     u64 g0, g1, g2;  | 
220  |  |     u128 t;  | 
221  |  |     u64 mask;  | 
222  |  |  | 
223  |  |     h0 = st->h[0];  | 
224  |  |     h1 = st->h[1];  | 
225  |  |     h2 = st->h[2];  | 
226  |  |  | 
227  |  |     /* compare to modulus by computing h + -p: g = h + 5, so bit 2 of g2 (weight 2^130) is set when h >= 2^130 - 5 */  | 
228  |  |     g0 = (u64)(t = (u128)h0 + 5);  | 
229  |  |     g1 = (u64)(t = (u128)h1 + (t >> 64));  | 
230  |  |     g2 = h2 + (u64)(t >> 64);  | 
231  |  |  | 
232  |  |     /* if there was carry into 131st bit, h1:h0 = g1:g0 (branch-free select through an all-ones or all-zeros mask) */  | 
233  |  |     mask = 0 - (g2 >> 2); /* all ones when g2 >> 2 is 1, zero when it is 0 */  | 
234  |  |     g0 &= mask;  | 
235  |  |     g1 &= mask;  | 
236  |  |     mask = ~mask;  | 
237  |  |     h0 = (h0 & mask) | g0;  | 
238  |  |     h1 = (h1 & mask) | g1;  | 
239  |  |  | 
240  |  |     /* mac = (h + nonce) % (2^128): the four 32-bit nonce words are paired into two 64-bit addends; the carry out of the top half is discarded */  | 
241  |  |     h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32));  | 
242  |  |     h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64));  | 
243  |  |  | 
244  |  |     U64TO8(mac + 0, h0); /* low eight tag bytes */  | 
245  |  |     U64TO8(mac + 8, h1); /* high eight tag bytes */  | 
246  |  | }  | 
247  |  |  | 
248  |  | # else  | 
249  |  |  | 
250  |  | #  if defined(_WIN32) && !defined(__MINGW32__)  | 
251  |  | typedef unsigned __int64 u64;  | 
252  |  | #  elif defined(__arch64__)  | 
253  |  | typedef unsigned long u64;  | 
254  |  | #  else  | 
255  |  | typedef unsigned long long u64;  | 
256  |  | #  endif  | 
257  |  |  | 
258  |  | typedef struct { /* Internal state of the base 2^32 implementation; accessed by casting the opaque ctx pointer. */ | 
259  |  |     u32 h[5]; /* accumulator limbs, least significant first; h[4] carries the bits at and above 2^128 */  | 
260  |  |     u32 r[4]; /* masked ("clamped") first 16 key bytes, least significant limb first */  | 
261  |  | } poly1305_internal;  | 
262  |  |  | 
263  |  | /* Store the low 32 bits of v into the four bytes p[0..3] in little endian order (p[0] receives the least significant byte). */  | 
264  |  | static void U32TO8(unsigned char *p, unsigned int v)  | 
265  |  | { | 
266  |  |     p[0] = (unsigned char)((v) & 0xff);  | 
267  |  |     p[1] = (unsigned char)((v >> 8) & 0xff);  | 
268  |  |     p[2] = (unsigned char)((v >> 16) & 0xff);  | 
269  |  |     p[3] = (unsigned char)((v >> 24) & 0xff);  | 
270  |  | }  | 
271  |  |  | 
272  |  | static void poly1305_init(void *ctx, const unsigned char key[16])  | 
273  |  | { /* Base 2^32 variant: reset the accumulator and load the masked multiplier r from the 16 key bytes. ctx is treated as a poly1305_internal. */ | 
274  |  |     poly1305_internal *st = (poly1305_internal *) ctx;  | 
275  |  |  | 
276  |  |     /* h = 0: start with an empty accumulator */  | 
277  |  |     st->h[0] = 0;  | 
278  |  |     st->h[1] = 0;  | 
279  |  |     st->h[2] = 0;  | 
280  |  |     st->h[3] = 0;  | 
281  |  |     st->h[4] = 0;  | 
282  |  |  | 
283  |  |     /* clamp: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, applied as four 32-bit words read little endian */  | 
284  |  |     st->r[0] = U8TOU32(&key[0]) & 0x0fffffff; /* least significant word keeps its low bits */  | 
285  |  |     st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc; /* r[1..3]: the two lowest bits are cleared as well */  | 
286  |  |     st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc;  | 
287  |  |     st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc;  | 
288  |  | }  | 
289  |  |  | 
290  |  | static void  | 
291  |  | poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)  | 
292  |  | { /* Base 2^32 variant. For every whole POLY1305_BLOCK_SIZE block of inp: h = (h + block + padbit * 2^128) * r, partially reduced modulo 2^130 - 5. A trailing chunk shorter than a block is not processed. */ | 
293  |  |     poly1305_internal *st = (poly1305_internal *)ctx;  | 
294  |  |     u32 r0, r1, r2, r3;  | 
295  |  |     u32 s1, s2, s3;  | 
296  |  |     u32 h0, h1, h2, h3, h4, c;  | 
297  |  |     u64 d0, d1, d2, d3;  | 
298  |  |  | 
299  |  |     r0 = st->r[0];  | 
300  |  |     r1 = st->r[1];  | 
301  |  |     r2 = st->r[2];  | 
302  |  |     r3 = st->r[3];  | 
303  |  |  | 
304  |  |     s1 = r1 + (r1 >> 2); /* sN = 5 * (rN / 4); the masking in poly1305_init cleared the two low bits of r1..r3, so the division is exact */  | 
305  |  |     s2 = r2 + (r2 >> 2);  | 
306  |  |     s3 = r3 + (r3 >> 2);  | 
307  |  |  | 
308  |  |     h0 = st->h[0]; /* work on local copies of the accumulator; written back after the loop */  | 
309  |  |     h1 = st->h[1];  | 
310  |  |     h2 = st->h[2];  | 
311  |  |     h3 = st->h[3];  | 
312  |  |     h4 = st->h[4];  | 
313  |  |  | 
314  |  |     while (len >= POLY1305_BLOCK_SIZE) { | 
315  |  |         /* h += m[i]: add the block as four little endian 32-bit words, carrying through the upper half of the u64 sums */  | 
316  |  |         h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0));  | 
317  |  |         h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4));  | 
318  |  |         h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8));  | 
319  |  |         h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12));  | 
320  |  |         h4 += (u32)(d3 >> 32) + padbit; /* carry out of h3 plus the pad bit, both at weight 2^128 */  | 
321  |  |  | 
322  |  |         /* h *= r "%" p, where "%" stands for "partial remainder" */  | 
323  |  |         d0 = ((u64)h0 * r0) +  | 
324  |  |              ((u64)h1 * s3) +  | 
325  |  |              ((u64)h2 * s2) +  | 
326  |  |              ((u64)h3 * s1);  | 
327  |  |         d1 = ((u64)h0 * r1) +  | 
328  |  |              ((u64)h1 * r0) +  | 
329  |  |              ((u64)h2 * s3) +  | 
330  |  |              ((u64)h3 * s2) +  | 
331  |  |              (h4 * s1);  | 
332  |  |         d2 = ((u64)h0 * r2) +  | 
333  |  |              ((u64)h1 * r1) +  | 
334  |  |              ((u64)h2 * r0) +  | 
335  |  |              ((u64)h3 * s3) +  | 
336  |  |              (h4 * s2);  | 
337  |  |         d3 = ((u64)h0 * r3) +  | 
338  |  |              ((u64)h1 * r2) +  | 
339  |  |              ((u64)h2 * r1) +  | 
340  |  |              ((u64)h3 * r0) +  | 
341  |  |              (h4 * s3);  | 
342  |  |         h4 = (h4 * r0); /* plain 32-bit product; h4 is not widened to u64 here or in the h4 terms above */  | 
343  |  |  | 
344  |  |         /* last reduction step: */  | 
345  |  |         /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */  | 
346  |  |         h0 = (u32)d0;  | 
347  |  |         h1 = (u32)(d1 += d0 >> 32);  | 
348  |  |         h2 = (u32)(d2 += d1 >> 32);  | 
349  |  |         h3 = (u32)(d3 += d2 >> 32);  | 
350  |  |         h4 += (u32)(d3 >> 32);  | 
351  |  |         /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */  | 
352  |  |         c = (h4 >> 2) + (h4 & ~3U); /* c = 5 * (h4 >> 2), written as x + 4x */  | 
353  |  |         h4 &= 3; /* keep only the bits below 2^130 */  | 
354  |  |         h0 += c;  | 
355  |  |         h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* c becomes 1 if the addition into h0 wrapped, else 0 */  | 
356  |  |         h2 += (c = CONSTANT_TIME_CARRY(h1,c)); /* the same carry test ripples through h1, h2 and h3 */  | 
357  |  |         h3 += (c = CONSTANT_TIME_CARRY(h2,c));  | 
358  |  |         h4 += CONSTANT_TIME_CARRY(h3,c);  | 
359  |  |         /*  | 
360  |  |          * Occasional overflows to 3rd bit of h4 are taken care of  | 
361  |  |          * "naturally". If after this point we end up at the top of  | 
362  |  |          * this loop, then the overflow bit will be accounted for  | 
363  |  |          * in next iteration. If we end up in poly1305_emit, then  | 
364  |  |          * comparison to modulus below will still count as "carry  | 
365  |  |          * into 131st bit", so that properly reduced value will be  | 
366  |  |          * picked in conditional move.  | 
367  |  |          */  | 
368  |  |  | 
369  |  |         inp += POLY1305_BLOCK_SIZE;  | 
370  |  |         len -= POLY1305_BLOCK_SIZE;  | 
371  |  |     }  | 
372  |  |  | 
373  |  |     st->h[0] = h0; /* store the updated accumulator back into the context */  | 
374  |  |     st->h[1] = h1;  | 
375  |  |     st->h[2] = h2;  | 
376  |  |     st->h[3] = h3;  | 
377  |  |     st->h[4] = h4;  | 
378  |  | }  | 
379  |  |  | 
380  |  | static void poly1305_emit(void *ctx, unsigned char mac[16],  | 
381  |  |                           const u32 nonce[4])  | 
382  |  | { /* Base 2^32 variant. Writes the 16-byte tag to mac: the accumulator fully reduced modulo 2^130 - 5, plus the 128-bit nonce, modulo 2^128, stored little endian. The context is only read. */ | 
383  |  |     poly1305_internal *st = (poly1305_internal *) ctx;  | 
384  |  |     u32 h0, h1, h2, h3, h4;  | 
385  |  |     u32 g0, g1, g2, g3, g4;  | 
386  |  |     u64 t;  | 
387  |  |     u32 mask;  | 
388  |  |  | 
389  |  |     h0 = st->h[0];  | 
390  |  |     h1 = st->h[1];  | 
391  |  |     h2 = st->h[2];  | 
392  |  |     h3 = st->h[3];  | 
393  |  |     h4 = st->h[4];  | 
394  |  |  | 
395  |  |     /* compare to modulus by computing h + -p: g = h + 5, so bit 2 of g4 (weight 2^130) is set when h >= 2^130 - 5 */  | 
396  |  |     g0 = (u32)(t = (u64)h0 + 5);  | 
397  |  |     g1 = (u32)(t = (u64)h1 + (t >> 32));  | 
398  |  |     g2 = (u32)(t = (u64)h2 + (t >> 32));  | 
399  |  |     g3 = (u32)(t = (u64)h3 + (t >> 32));  | 
400  |  |     g4 = h4 + (u32)(t >> 32);  | 
401  |  |  | 
402  |  |     /* if there was carry into 131st bit, h3:h0 = g3:g0 (branch-free select through an all-ones or all-zeros mask) */  | 
403  |  |     mask = 0 - (g4 >> 2); /* all ones when g4 >> 2 is 1, zero when it is 0 */  | 
404  |  |     g0 &= mask;  | 
405  |  |     g1 &= mask;  | 
406  |  |     g2 &= mask;  | 
407  |  |     g3 &= mask;  | 
408  |  |     mask = ~mask;  | 
409  |  |     h0 = (h0 & mask) | g0;  | 
410  |  |     h1 = (h1 & mask) | g1;  | 
411  |  |     h2 = (h2 & mask) | g2;  | 
412  |  |     h3 = (h3 & mask) | g3;  | 
413  |  |  | 
414  |  |     /* mac = (h + nonce) % (2^128): word-by-word addition with carry; the carry out of the top word is discarded */  | 
415  |  |     h0 = (u32)(t = (u64)h0 + nonce[0]);  | 
416  |  |     h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);  | 
417  |  |     h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);  | 
418  |  |     h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);  | 
419  |  |  | 
420  |  |     U32TO8(mac + 0, h0); /* tag bytes are emitted least significant word first */  | 
421  |  |     U32TO8(mac + 4, h1);  | 
422  |  |     U32TO8(mac + 8, h2);  | 
423  |  |     U32TO8(mac + 12, h3);  | 
424  |  | }  | 
425  |  | # endif  | 
426  |  | #else  | 
427  |  | int poly1305_init(void *ctx, const unsigned char key[16], void *func); /* POLY1305_ASM build: declared here, not defined in this file; unlike the C version it returns int and takes a third argument */  | 
428  |  | void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,  | 
429  |  |                      unsigned int padbit); /* same parameter list as the C reference version, but with external linkage */  | 
430  |  | void poly1305_emit(void *ctx, unsigned char mac[16],  | 
431  |  |                    const unsigned int nonce[4]); /* same parameter list as the C reference version, but with external linkage */  | 
432  |  | #endif  | 
433  |  |  | 
434  |  | void Poly1305_Init(POLY1305 *ctx, const unsigned char key[32])  | 
435  | 0  | { /* Prepare ctx for a new computation from a 32-byte key: bytes 16..31 become the nonce words, bytes 0..15 are handed to poly1305_init, and the partial-block counter is reset. */ | 
436  | 0  |     ctx->nonce[0] = U8TOU32(&key[16]); /* second key half, kept as four little endian 32-bit words */  | 
437  | 0  |     ctx->nonce[1] = U8TOU32(&key[20]);  | 
438  | 0  |     ctx->nonce[2] = U8TOU32(&key[24]);  | 
439  | 0  |     ctx->nonce[3] = U8TOU32(&key[28]);  | 
440  |  | 
  | 
441  |  | #ifndef POLY1305_ASM  | 
442  |  |     poly1305_init(ctx->opaque, key); /* reads only the first 16 key bytes, per its key[16] parameter */  | 
443  |  | #else  | 
444  |  |     /*  | 
445  |  |      * Unlike reference poly1305_init assembly counterpart is expected  | 
446  |  |      * to return a value: non-zero if it initializes ctx->func, and zero  | 
447  |  |      * otherwise. Latter is to simplify assembly in cases when there are no  | 
448  |  |      * multiple code paths to switch between.  | 
449  |  |      */  | 
450  | 0  |     if (!poly1305_init(ctx->opaque, key, &ctx->func)) { | 
451  | 0  |         ctx->func.blocks = poly1305_blocks; /* zero return: fall back to the default entry points */  | 
452  | 0  |         ctx->func.emit = poly1305_emit;  | 
453  | 0  |     }  | 
454  | 0  | #endif  | 
455  |  | 
  | 
456  | 0  |     ctx->num = 0; /* no bytes buffered in ctx->data yet */  | 
457  |  | 
  | 
458  | 0  | }  | 
459  |  |  | 
460  |  | #ifdef POLY1305_ASM  | 
461  |  | /*  | 
462  |  |  * This "eclipses" poly1305_blocks and poly1305_emit, but it's a  | 
463  |  |  * conscious choice imposed by -Wshadow compiler warnings. From here on both names expand to calls through function pointers named poly1305_blocks_p and poly1305_emit_p, which the functions below declare as locals loaded from ctx->func.  | 
464  |  |  */  | 
465  | 0  | # define poly1305_blocks (*poly1305_blocks_p)  | 
466  | 0  | # define poly1305_emit   (*poly1305_emit_p)  | 
467  |  | #endif  | 
468  |  |  | 
469  |  | void Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len)  | 
470  | 0  | { /* Absorb len bytes from inp. Whole blocks are passed to poly1305_blocks with padbit 1; a leftover shorter than a block is kept in ctx->data, with its length in ctx->num, for a later call. */ | 
471  | 0  | #ifdef POLY1305_ASM  | 
472  |  |     /*  | 
473  |  |      * As documented, poly1305_blocks is never called with input  | 
474  |  |      * longer than single block and padbit argument set to 0. This  | 
475  |  |      * property is fluently used in assembly modules to optimize  | 
476  |  |      * padbit handling on loop boundary.  | 
477  |  |      */  | 
478  | 0  |     poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;  | 
479  | 0  | #endif  | 
480  | 0  |     size_t rem, num;  | 
481  |  | 
  | 
482  | 0  |     if ((num = ctx->num)) { /* bytes left over from an earlier call are waiting in ctx->data */ | 
483  | 0  |         rem = POLY1305_BLOCK_SIZE - num; /* bytes still missing to complete the buffered block */  | 
484  | 0  |         if (len >= rem) { | 
485  | 0  |             memcpy(ctx->data + num, inp, rem);  | 
486  | 0  |             poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1);  | 
487  | 0  |             inp += rem;  | 
488  | 0  |             len -= rem;  | 
489  | 0  |         } else { | 
490  |  |             /* Still not enough data to process a block. */  | 
491  | 0  |             memcpy(ctx->data + num, inp, len);  | 
492  | 0  |             ctx->num = num + len;  | 
493  | 0  |             return;  | 
494  | 0  |         }  | 
495  | 0  |     }  | 
496  |  |  | 
497  | 0  |     rem = len % POLY1305_BLOCK_SIZE; /* rem is reused: size of the trailing partial block */  | 
498  | 0  |     len -= rem; /* len is now a whole number of blocks */  | 
499  |  | 
  | 
500  | 0  |     if (len >= POLY1305_BLOCK_SIZE) { | 
501  | 0  |         poly1305_blocks(ctx->opaque, inp, len, 1); /* processed straight from the caller's buffer, without copying */  | 
502  | 0  |         inp += len;  | 
503  | 0  |     }  | 
504  |  | 
  | 
505  | 0  |     if (rem)  | 
506  | 0  |         memcpy(ctx->data, inp, rem); /* stash the tail for the next Update or Final */  | 
507  |  | 
  | 
508  | 0  |     ctx->num = rem;  | 
509  | 0  | }  | 
510  |  |  | 
511  |  | void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16])  | 
512  | 0  | { /* Finish the computation: pad and process any buffered partial block, write the 16-byte tag to mac, then wipe the whole context. */ | 
513  | 0  | #ifdef POLY1305_ASM  | 
514  | 0  |     poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;  | 
515  | 0  |     poly1305_emit_f poly1305_emit_p = ctx->func.emit;  | 
516  | 0  | #endif  | 
517  | 0  |     size_t num;  | 
518  |  | 
  | 
519  | 0  |     if ((num = ctx->num)) { /* a partial block is still buffered in ctx->data */ | 
520  | 0  |         ctx->data[num++] = 1;   /* pad bit */  | 
521  | 0  |         while (num < POLY1305_BLOCK_SIZE)  | 
522  | 0  |             ctx->data[num++] = 0; /* zero-fill the rest of the block */  | 
523  | 0  |         poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0); /* padbit 0: the pad bit is already in the data */  | 
524  | 0  |     }  | 
525  |  | 
  | 
526  | 0  |     poly1305_emit(ctx->opaque, mac, ctx->nonce);  | 
527  |  |  | 
528  |  |     /* zero out the state: the whole *ctx is cleansed, including ctx->opaque, ctx->nonce and ctx->data */  | 
529  | 0  |     OPENSSL_cleanse(ctx, sizeof(*ctx));  | 
530  | 0  | }  | 