/src/BearSSL/src/hash/ghash_ctmul32.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining |
5 | | * a copy of this software and associated documentation files (the |
6 | | * "Software"), to deal in the Software without restriction, including |
7 | | * without limitation the rights to use, copy, modify, merge, publish, |
8 | | * distribute, sublicense, and/or sell copies of the Software, and to |
9 | | * permit persons to whom the Software is furnished to do so, subject to |
10 | | * the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice shall be |
13 | | * included in all copies or substantial portions of the Software. |
14 | | * |
15 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
16 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
17 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
18 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
19 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
20 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
21 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #include "inner.h" |
26 | | |
27 | | /* |
28 | | * This implementation uses 32-bit multiplications, and only the low |
29 | | * 32 bits for each multiplication result. This is meant primarily for |
30 | | * the ARM Cortex M0 and M0+, whose multiplication opcode does not yield |
31 | | * the upper 32 bits; but it might also be useful on architectures where |
32 | | * access to the upper 32 bits requires use of specific registers that |
33 | | * create contention (e.g. on i386, "mul" necessarily outputs the result |
34 | | * in edx:eax, while "imul" can use any registers but is limited to the |
35 | | * low 32 bits). |
36 | | * |
37 | | * The implementation trick that is used here is bit-reversing (bit 0 |
38 | | * is swapped with bit 31, bit 1 with bit 30, and so on). In GF(2)[X], |
39 | | * for all values x and y, we have: |
40 | | * rev32(x) * rev32(y) = rev64(x * y) |
41 | | * In other words, if we bit-reverse (over 32 bits) the operands, then we |
42 | | * bit-reverse (over 64 bits) the result. |
43 | | */ |
44 | | |
45 | | /* |
46 | | * Multiplication in GF(2)[X], truncated to its low 32 bits. |
47 | | */ |
48 | | static inline uint32_t |
49 | | bmul32(uint32_t x, uint32_t y) |
50 | 1.62M | { |
51 | 1.62M | uint32_t x0, x1, x2, x3; |
52 | 1.62M | uint32_t y0, y1, y2, y3; |
53 | 1.62M | uint32_t z0, z1, z2, z3; |
54 | | |
55 | 1.62M | x0 = x & (uint32_t)0x11111111; |
56 | 1.62M | x1 = x & (uint32_t)0x22222222; |
57 | 1.62M | x2 = x & (uint32_t)0x44444444; |
58 | 1.62M | x3 = x & (uint32_t)0x88888888; |
59 | 1.62M | y0 = y & (uint32_t)0x11111111; |
60 | 1.62M | y1 = y & (uint32_t)0x22222222; |
61 | 1.62M | y2 = y & (uint32_t)0x44444444; |
62 | 1.62M | y3 = y & (uint32_t)0x88888888; |
63 | 1.62M | z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); |
64 | 1.62M | z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); |
65 | 1.62M | z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); |
66 | 1.62M | z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); |
67 | 1.62M | z0 &= (uint32_t)0x11111111; |
68 | 1.62M | z1 &= (uint32_t)0x22222222; |
69 | 1.62M | z2 &= (uint32_t)0x44444444; |
70 | 1.62M | z3 &= (uint32_t)0x88888888; |
71 | 1.62M | return z0 | z1 | z2 | z3; |
72 | 1.62M | } |
73 | | |
74 | | /* |
75 | | * Bit-reverse a 32-bit word. |
76 | | */ |
77 | | static uint32_t |
78 | | rev32(uint32_t x) |
79 | 1.06M | { |
80 | 4.24M | #define RMS(m, s) do { \ |
81 | 4.24M | x = ((x & (uint32_t)(m)) << (s)) \ |
82 | 4.24M | | ((x >> (s)) & (uint32_t)(m)); \ |
83 | 4.24M | } while (0) |
84 | | |
85 | 1.06M | RMS(0x55555555, 1); |
86 | 1.06M | RMS(0x33333333, 2); |
87 | 1.06M | RMS(0x0F0F0F0F, 4); |
88 | 1.06M | RMS(0x00FF00FF, 8); |
89 | 1.06M | return (x << 16) | (x >> 16); |
90 | | |
91 | 1.06M | #undef RMS |
92 | 1.06M | } |
93 | | |
94 | | /* see bearssl_hash.h */ |
95 | | void |
96 | | br_ghash_ctmul32(void *y, const void *h, const void *data, size_t len) |
97 | 16.6k | { |
98 | | /* |
99 | | * This implementation is similar to br_ghash_ctmul() except |
100 | | * that we have to do the multiplication twice, with the |
101 | | * "normal" and "bit reversed" operands. Hence we end up with |
102 | | * eighteen 32-bit multiplications instead of nine. |
103 | | */ |
104 | | |
105 | 16.6k | const unsigned char *buf, *hb; |
106 | 16.6k | unsigned char *yb; |
107 | 16.6k | uint32_t yw[4]; |
108 | 16.6k | uint32_t hw[4], hwr[4]; |
109 | | |
110 | 16.6k | buf = data; |
111 | 16.6k | yb = y; |
112 | 16.6k | hb = h; |
113 | 16.6k | yw[3] = br_dec32be(yb); |
114 | 16.6k | yw[2] = br_dec32be(yb + 4); |
115 | 16.6k | yw[1] = br_dec32be(yb + 8); |
116 | 16.6k | yw[0] = br_dec32be(yb + 12); |
117 | 16.6k | hw[3] = br_dec32be(hb); |
118 | 16.6k | hw[2] = br_dec32be(hb + 4); |
119 | 16.6k | hw[1] = br_dec32be(hb + 8); |
120 | 16.6k | hw[0] = br_dec32be(hb + 12); |
121 | 16.6k | hwr[3] = rev32(hw[3]); |
122 | 16.6k | hwr[2] = rev32(hw[2]); |
123 | 16.6k | hwr[1] = rev32(hw[1]); |
124 | 16.6k | hwr[0] = rev32(hw[0]); |
125 | 107k | while (len > 0) { |
126 | 90.4k | const unsigned char *src; |
127 | 90.4k | unsigned char tmp[16]; |
128 | 90.4k | int i; |
129 | 90.4k | uint32_t a[18], b[18], c[18]; |
130 | 90.4k | uint32_t d0, d1, d2, d3, d4, d5, d6, d7; |
131 | 90.4k | uint32_t zw[8]; |
132 | | |
133 | 90.4k | if (len >= 16) { |
134 | 88.4k | src = buf; |
135 | 88.4k | buf += 16; |
136 | 88.4k | len -= 16; |
137 | 88.4k | } else { |
138 | 2.05k | memcpy(tmp, buf, len); |
139 | 2.05k | memset(tmp + len, 0, (sizeof tmp) - len); |
140 | 2.05k | src = tmp; |
141 | 2.05k | len = 0; |
142 | 2.05k | } |
143 | 90.4k | yw[3] ^= br_dec32be(src); |
144 | 90.4k | yw[2] ^= br_dec32be(src + 4); |
145 | 90.4k | yw[1] ^= br_dec32be(src + 8); |
146 | 90.4k | yw[0] ^= br_dec32be(src + 12); |
147 | | |
148 | | /* |
149 | | * We are using Karatsuba: the 128x128 multiplication is |
150 | | * reduced to three 64x64 multiplications, hence nine |
151 | | * 32x32 multiplications. With the bit-reversal trick, |
152 | | * we have to perform 18 32x32 multiplications. |
153 | | */ |
154 | | |
155 | | /* |
156 | | * y[0,1]*h[0,1] -> 0,1,4 |
157 | | * y[2,3]*h[2,3] -> 2,3,5 |
158 | | * (y[0,1]+y[2,3])*(h[0,1]+h[2,3]) -> 6,7,8 |
159 | | */ |
160 | | |
161 | 90.4k | a[0] = yw[0]; |
162 | 90.4k | a[1] = yw[1]; |
163 | 90.4k | a[2] = yw[2]; |
164 | 90.4k | a[3] = yw[3]; |
165 | 90.4k | a[4] = a[0] ^ a[1]; |
166 | 90.4k | a[5] = a[2] ^ a[3]; |
167 | 90.4k | a[6] = a[0] ^ a[2]; |
168 | 90.4k | a[7] = a[1] ^ a[3]; |
169 | 90.4k | a[8] = a[6] ^ a[7]; |
170 | | |
171 | 90.4k | a[ 9] = rev32(yw[0]); |
172 | 90.4k | a[10] = rev32(yw[1]); |
173 | 90.4k | a[11] = rev32(yw[2]); |
174 | 90.4k | a[12] = rev32(yw[3]); |
175 | 90.4k | a[13] = a[ 9] ^ a[10]; |
176 | 90.4k | a[14] = a[11] ^ a[12]; |
177 | 90.4k | a[15] = a[ 9] ^ a[11]; |
178 | 90.4k | a[16] = a[10] ^ a[12]; |
179 | 90.4k | a[17] = a[15] ^ a[16]; |
180 | | |
181 | 90.4k | b[0] = hw[0]; |
182 | 90.4k | b[1] = hw[1]; |
183 | 90.4k | b[2] = hw[2]; |
184 | 90.4k | b[3] = hw[3]; |
185 | 90.4k | b[4] = b[0] ^ b[1]; |
186 | 90.4k | b[5] = b[2] ^ b[3]; |
187 | 90.4k | b[6] = b[0] ^ b[2]; |
188 | 90.4k | b[7] = b[1] ^ b[3]; |
189 | 90.4k | b[8] = b[6] ^ b[7]; |
190 | | |
191 | 90.4k | b[ 9] = hwr[0]; |
192 | 90.4k | b[10] = hwr[1]; |
193 | 90.4k | b[11] = hwr[2]; |
194 | 90.4k | b[12] = hwr[3]; |
195 | 90.4k | b[13] = b[ 9] ^ b[10]; |
196 | 90.4k | b[14] = b[11] ^ b[12]; |
197 | 90.4k | b[15] = b[ 9] ^ b[11]; |
198 | 90.4k | b[16] = b[10] ^ b[12]; |
199 | 90.4k | b[17] = b[15] ^ b[16]; |
200 | | |
201 | 1.71M | for (i = 0; i < 18; i ++) { |
202 | 1.62M | c[i] = bmul32(a[i], b[i]); |
203 | 1.62M | } |
204 | | |
205 | 90.4k | c[4] ^= c[0] ^ c[1]; |
206 | 90.4k | c[5] ^= c[2] ^ c[3]; |
207 | 90.4k | c[8] ^= c[6] ^ c[7]; |
208 | | |
209 | 90.4k | c[13] ^= c[ 9] ^ c[10]; |
210 | 90.4k | c[14] ^= c[11] ^ c[12]; |
211 | 90.4k | c[17] ^= c[15] ^ c[16]; |
212 | | |
213 | | /* |
214 | | * y[0,1]*h[0,1] -> 0,9^4,1^13,10 |
215 | | * y[2,3]*h[2,3] -> 2,11^5,3^14,12 |
216 | | * (y[0,1]+y[2,3])*(h[0,1]+h[2,3]) -> 6,15^8,7^17,16 |
217 | | */ |
218 | 90.4k | d0 = c[0]; |
219 | 90.4k | d1 = c[4] ^ (rev32(c[9]) >> 1); |
220 | 90.4k | d2 = c[1] ^ c[0] ^ c[2] ^ c[6] ^ (rev32(c[13]) >> 1); |
221 | 90.4k | d3 = c[4] ^ c[5] ^ c[8] |
222 | 90.4k | ^ (rev32(c[10] ^ c[9] ^ c[11] ^ c[15]) >> 1); |
223 | 90.4k | d4 = c[2] ^ c[1] ^ c[3] ^ c[7] |
224 | 90.4k | ^ (rev32(c[13] ^ c[14] ^ c[17]) >> 1); |
225 | 90.4k | d5 = c[5] ^ (rev32(c[11] ^ c[10] ^ c[12] ^ c[16]) >> 1); |
226 | 90.4k | d6 = c[3] ^ (rev32(c[14]) >> 1); |
227 | 90.4k | d7 = rev32(c[12]) >> 1; |
228 | | |
229 | 90.4k | zw[0] = d0 << 1; |
230 | 90.4k | zw[1] = (d1 << 1) | (d0 >> 31); |
231 | 90.4k | zw[2] = (d2 << 1) | (d1 >> 31); |
232 | 90.4k | zw[3] = (d3 << 1) | (d2 >> 31); |
233 | 90.4k | zw[4] = (d4 << 1) | (d3 >> 31); |
234 | 90.4k | zw[5] = (d5 << 1) | (d4 >> 31); |
235 | 90.4k | zw[6] = (d6 << 1) | (d5 >> 31); |
236 | 90.4k | zw[7] = (d7 << 1) | (d6 >> 31); |
237 | | |
238 | 452k | for (i = 0; i < 4; i ++) { |
239 | 361k | uint32_t lw; |
240 | | |
241 | 361k | lw = zw[i]; |
242 | 361k | zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7); |
243 | 361k | zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25); |
244 | 361k | } |
245 | 90.4k | memcpy(yw, zw + 4, sizeof yw); |
246 | 90.4k | } |
247 | 16.6k | br_enc32be(yb, yw[3]); |
248 | 16.6k | br_enc32be(yb + 4, yw[2]); |
249 | 16.6k | br_enc32be(yb + 8, yw[1]); |
250 | 16.6k | br_enc32be(yb + 12, yw[0]); |
251 | 16.6k | } |