/src/boringssl/crypto/fipsmodule/modes/gcm_nohw.c.inc
Line | Count | Source |
1 | | /* Copyright (c) 2019, Google Inc. |
2 | | * |
3 | | * Permission to use, copy, modify, and/or distribute this software for any |
4 | | * purpose with or without fee is hereby granted, provided that the above |
5 | | * copyright notice and this permission notice appear in all copies. |
6 | | * |
7 | | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
8 | | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
9 | | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
10 | | * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
11 | | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
12 | | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
13 | | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ |
14 | | |
15 | | #include <openssl/base.h> |
16 | | |
17 | | #include "../../internal.h" |
18 | | #include "internal.h" |
19 | | |
20 | | #if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2) |
21 | | #include <emmintrin.h> |
22 | | #endif |
23 | | |
24 | | |
25 | | // This file contains a constant-time implementation of GHASH based on the notes |
26 | | // in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction |
27 | | // algorithm described in |
28 | | // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. |
29 | | // |
30 | | // Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our |
31 | | // primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run |
32 | | // the 32-bit implementation, but we can use its intrinsics if necessary. |
33 | | |
34 | | #if defined(BORINGSSL_HAS_UINT128) |
35 | | |
36 | | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, |
37 | 27.8k | uint64_t b) { |
38 | | // One term every four bits means the largest term is 64/4 = 16, which barely |
39 | | // overflows into the next term. Using one term every five bits would cost 25 |
40 | | // multiplications instead of 16. It is faster to mask off the bottom four |
41 | | // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits |
42 | | // separately. |
43 | 27.8k | uint64_t a0 = a & UINT64_C(0x1111111111111110); |
44 | 27.8k | uint64_t a1 = a & UINT64_C(0x2222222222222220); |
45 | 27.8k | uint64_t a2 = a & UINT64_C(0x4444444444444440); |
46 | 27.8k | uint64_t a3 = a & UINT64_C(0x8888888888888880); |
47 | | |
48 | 27.8k | uint64_t b0 = b & UINT64_C(0x1111111111111111); |
49 | 27.8k | uint64_t b1 = b & UINT64_C(0x2222222222222222); |
50 | 27.8k | uint64_t b2 = b & UINT64_C(0x4444444444444444); |
51 | 27.8k | uint64_t b3 = b & UINT64_C(0x8888888888888888); |
52 | | |
53 | 27.8k | uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^ |
54 | 27.8k | (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1); |
55 | 27.8k | uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^ |
56 | 27.8k | (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2); |
57 | 27.8k | uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^ |
58 | 27.8k | (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3); |
59 | 27.8k | uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^ |
60 | 27.8k | (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0); |
61 | | |
62 | | // Multiply the bottom four bits of |a| with |b|. |
63 | 27.8k | uint64_t a0_mask = UINT64_C(0) - (a & 1); |
64 | 27.8k | uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1); |
65 | 27.8k | uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1); |
66 | 27.8k | uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1); |
67 | 27.8k | uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^ |
68 | 27.8k | ((uint128_t)(a2_mask & b) << 2) ^ |
69 | 27.8k | ((uint128_t)(a3_mask & b) << 3); |
70 | | |
71 | 27.8k | *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^ |
72 | 27.8k | (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^ |
73 | 27.8k | (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^ |
74 | 27.8k | (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra); |
75 | 27.8k | *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^ |
76 | 27.8k | (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^ |
77 | 27.8k | (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^ |
78 | 27.8k | (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^ |
79 | 27.8k | ((uint64_t)(extra >> 64)); |
80 | 27.8k | } |
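
The masking above is what lets ordinary integer multiplies stand in for carry-less ones: with the bottom four bits of |a| handled separately, every term stays at or below 15, so nothing spills into the next 4-bit slot and the 0x1111.../0x2222.../0x4444.../0x8888... masks recover the GF(2) product bits. A minimal standalone sketch (not part of this file) that checks the result against a naive bit-at-a-time carry-less multiply could look like the following; clmul64_ref is a hypothetical reference and is not constant-time.

    #include <assert.h>
    #include <stdint.h>

    // Naive 64x64 -> 128-bit carry-less multiply, for illustration only. It
    // branches on the bits of |b|, so it must never be used on secret data.
    static void clmul64_ref(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                            uint64_t b) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          hi ^= (i == 0) ? 0 : a >> (64 - i);
        }
      }
      *out_lo = lo;
      *out_hi = hi;
    }

    // Usage sketch: for any a, b, gcm_mul64_nohw(&lo, &hi, a, b) should agree
    // with clmul64_ref(&ref_lo, &ref_hi, a, b) in both halves.
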
81 | | |
82 | | #elif defined(OPENSSL_SSE2) |
83 | | |
84 | | static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) { |
85 | | // One term every four bits means the largest term is 32/4 = 8, which does not |
86 | | // overflow into the next term. |
87 | | __m128i aa = _mm_setr_epi32(a, 0, a, 0); |
88 | | __m128i bb = _mm_setr_epi32(b, 0, b, 0); |
89 | | |
90 | | __m128i a0a0 = |
91 | | _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0)); |
92 | | __m128i a2a2 = |
93 | | _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0)); |
94 | | __m128i b0b1 = |
95 | | _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0)); |
96 | | __m128i b2b3 = |
97 | | _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0)); |
98 | | |
99 | | __m128i c0c1 = |
100 | | _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3)); |
101 | | __m128i c2c3 = |
102 | | _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3)); |
103 | | |
104 | | __m128i a1a1 = |
105 | | _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0)); |
106 | | __m128i a3a3 = |
107 | | _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0)); |
108 | | __m128i b3b0 = |
109 | | _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0)); |
110 | | __m128i b1b2 = |
111 | | _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0)); |
112 | | |
113 | | c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0)); |
114 | | c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2)); |
115 | | c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0)); |
116 | | c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2)); |
117 | | |
118 | | c0c1 = _mm_and_si128( |
119 | | c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222)); |
120 | | c2c3 = _mm_and_si128( |
121 | | c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888)); |
122 | | |
123 | | c0c1 = _mm_xor_si128(c0c1, c2c3); |
124 | | // c0 ^= c1 |
125 | | c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8)); |
126 | | return c0c1; |
127 | | } |
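
The (a, 0, a, 0) packing above exists because _mm_mul_epu32 multiplies only the low 32 bits of each 64-bit lane, giving two independent 64-bit products per instruction. A tiny standalone demonstration of that behaviour (arbitrary example values, hypothetical helper name):

    #include <emmintrin.h>
    #include <stdint.h>

    // _mm_mul_epu32 multiplies the 32-bit values in lanes 0 and 2 of each
    // operand and returns the full 64-bit products in the result's two lanes.
    static void mul_epu32_demo(uint64_t out[2]) {
      __m128i x = _mm_setr_epi32(3, 0, 5, 0);
      __m128i y = _mm_setr_epi32(7, 0, 11, 0);
      __m128i p = _mm_mul_epu32(x, y);  // lane 0 holds 21, lane 1 holds 55
      _mm_storeu_si128((__m128i *)out, p);
    }
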
128 | | |
129 | | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, |
130 | | uint64_t b) { |
131 | | uint32_t a0 = a & 0xffffffff; |
132 | | uint32_t a1 = a >> 32; |
133 | | uint32_t b0 = b & 0xffffffff; |
134 | | uint32_t b1 = b >> 32; |
135 | | // Karatsuba multiplication. |
136 | | __m128i lo = gcm_mul32_nohw(a0, b0); |
137 | | __m128i hi = gcm_mul32_nohw(a1, b1); |
138 | | __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1); |
139 | | mid = _mm_xor_si128(mid, lo); |
140 | | mid = _mm_xor_si128(mid, hi); |
141 | | __m128i ret = _mm_unpacklo_epi64(lo, hi); |
142 | | mid = _mm_slli_si128(mid, 4); |
143 | | mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0)); |
144 | | ret = _mm_xor_si128(ret, mid); |
145 | | memcpy(out_lo, &ret, 8); |
146 | | memcpy(out_hi, ((char*)&ret) + 8, 8); |
147 | | } |
148 | | |
149 | | #else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2 |
150 | | |
151 | | static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) { |
152 | | // One term every four bits means the largest term is 32/4 = 8, which does not |
153 | | // overflow into the next term. |
154 | | uint32_t a0 = a & 0x11111111; |
155 | | uint32_t a1 = a & 0x22222222; |
156 | | uint32_t a2 = a & 0x44444444; |
157 | | uint32_t a3 = a & 0x88888888; |
158 | | |
159 | | uint32_t b0 = b & 0x11111111; |
160 | | uint32_t b1 = b & 0x22222222; |
161 | | uint32_t b2 = b & 0x44444444; |
162 | | uint32_t b3 = b & 0x88888888; |
163 | | |
164 | | uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^ |
165 | | (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1); |
166 | | uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^ |
167 | | (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2); |
168 | | uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^ |
169 | | (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3); |
170 | | uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^ |
171 | | (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0); |
172 | | |
173 | | return (c0 & UINT64_C(0x1111111111111111)) | |
174 | | (c1 & UINT64_C(0x2222222222222222)) | |
175 | | (c2 & UINT64_C(0x4444444444444444)) | |
176 | | (c3 & UINT64_C(0x8888888888888888)); |
177 | | } |
178 | | |
179 | | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, |
180 | | uint64_t b) { |
181 | | uint32_t a0 = a & 0xffffffff; |
182 | | uint32_t a1 = a >> 32; |
183 | | uint32_t b0 = b & 0xffffffff; |
184 | | uint32_t b1 = b >> 32; |
185 | | // Karatsuba multiplication. |
186 | | uint64_t lo = gcm_mul32_nohw(a0, b0); |
187 | | uint64_t hi = gcm_mul32_nohw(a1, b1); |
188 | | uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; |
189 | | *out_lo = lo ^ (mid << 32); |
190 | | *out_hi = hi ^ (mid >> 32); |
191 | | } |
192 | | |
193 | | #endif // BORINGSSL_HAS_UINT128 |
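
Both 32-bit paths above use the same Karatsuba split: three half-width carry-less multiplies instead of four, with the cross term recovered as (a0 ^ a1) * (b0 ^ b1) ^ lo ^ hi. A self-contained sketch of the identity at 8-bit scale, using a hypothetical bit-at-a-time reference multiply (not part of this file):

    #include <assert.h>
    #include <stdint.h>

    // Naive 8x8 -> 16-bit carry-less multiply, for illustration only.
    static uint16_t clmul8(uint8_t a, uint8_t b) {
      uint16_t r = 0;
      for (int i = 0; i < 8; i++) {
        if ((b >> i) & 1) {
          r ^= (uint16_t)a << i;
        }
      }
      return r;
    }

    int main(void) {
      uint8_t a = 0xd3, b = 0x7b;
      uint8_t a0 = a & 0xf, a1 = a >> 4;
      uint8_t b0 = b & 0xf, b1 = b >> 4;
      uint16_t lo = clmul8(a0, b0);
      uint16_t hi = clmul8(a1, b1);
      // Over GF(2), (a0 ^ a1)(b0 ^ b1) = a0*b0 ^ a0*b1 ^ a1*b0 ^ a1*b1, so
      // folding in |lo| and |hi| leaves exactly the two cross terms.
      uint16_t mid = clmul8(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
      assert((uint16_t)(lo ^ (mid << 4) ^ (hi << 8)) == clmul8(a, b));
      return 0;
    }
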
194 | | |
195 | 694 | void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) { |
196 | | // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This |
197 | | // avoids a shift by 1 in the multiplication, needed to account for bit |
198 | | // reversal losing a bit after multiplication, that is, |
199 | | // rev128(X) * rev128(Y) = rev255(X*Y). |
200 | | // |
201 | | // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation |
202 | | // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. |
203 | | // |
204 | | // See also slide 16 of |
205 | | // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf |
206 | 694 | Htable[0].lo = Xi[1]; |
207 | 694 | Htable[0].hi = Xi[0]; |
208 | | |
209 | 694 | uint64_t carry = Htable[0].hi >> 63; |
210 | 694 | carry = 0u - carry; |
211 | | |
212 | 694 | Htable[0].hi <<= 1; |
213 | 694 | Htable[0].hi |= Htable[0].lo >> 63; |
214 | 694 | Htable[0].lo <<= 1; |
215 | | |
216 | | // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we |
217 | | // conditionally add 0xc200...0001. |
218 | 694 | Htable[0].lo ^= carry & 1; |
219 | 694 | Htable[0].hi ^= carry & UINT64_C(0xc200000000000000); |
220 | | |
221 | | // This implementation does not use the rest of |Htable|. |
222 | 694 | } |
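
The |carry| computation above is the standard constant-time select: subtracting a single bit from zero yields either all zeros or all ones, so the reduction constant is XORed in only when the top bit was set, with no data-dependent branch. A minimal standalone sketch of the pattern (hypothetical name):

    #include <stdint.h>

    // Returns x ^ c when the top bit of |x| is set, and x otherwise, without
    // branching on the secret bit.
    static uint64_t xor_if_top_bit_set(uint64_t x, uint64_t c) {
      uint64_t mask = UINT64_C(0) - (x >> 63);  // all ones or all zeros
      return x ^ (mask & c);
    }
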
223 | | |
224 | 9.27k | static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) { |
225 | | // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| |
226 | | // through |r3|. Note there is no byte or bit reversal because we are |
227 | | // evaluating POLYVAL. |
228 | 9.27k | uint64_t r0, r1; |
229 | 9.27k | gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo); |
230 | 9.27k | uint64_t r2, r3; |
231 | 9.27k | gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi); |
232 | 9.27k | uint64_t mid0, mid1; |
233 | 9.27k | gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo); |
234 | 9.27k | mid0 ^= r0 ^ r2; |
235 | 9.27k | mid1 ^= r1 ^ r3; |
236 | 9.27k | r2 ^= mid1; |
237 | 9.27k | r1 ^= mid0; |
238 | | |
239 | | // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and |
240 | | // |r3| shift into position, and we must multiply |r0| and |r1| by x^-128. We |
241 | | // have: |
242 | | // |
243 | | // 1 = x^121 + x^126 + x^127 + x^128 |
244 | | // x^-128 = x^-7 + x^-2 + x^-1 + 1 |
245 | | // |
246 | | // This is the GHASH reduction step, but with bits flowing in reverse. |
247 | | |
248 | | // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require |
249 | | // another reduction step. Instead, we gather the excess bits, incorporate |
250 | | // them into |r0| and |r1|, and reduce once. See slides 17-19 |
251 | | // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. |
252 | 9.27k | r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); |
253 | | |
254 | | // 1 |
255 | 9.27k | r2 ^= r0; |
256 | 9.27k | r3 ^= r1; |
257 | | |
258 | | // x^-1 |
259 | 9.27k | r2 ^= r0 >> 1; |
260 | 9.27k | r2 ^= r1 << 63; |
261 | 9.27k | r3 ^= r1 >> 1; |
262 | | |
263 | | // x^-2 |
264 | 9.27k | r2 ^= r0 >> 2; |
265 | 9.27k | r2 ^= r1 << 62; |
266 | 9.27k | r3 ^= r1 >> 2; |
267 | | |
268 | | // x^-7 |
269 | 9.27k | r2 ^= r0 >> 7; |
270 | 9.27k | r2 ^= r1 << 57; |
271 | 9.27k | r3 ^= r1 >> 7; |
272 | | |
273 | 9.27k | Xi[0] = r2; |
274 | 9.27k | Xi[1] = r3; |
275 | 9.27k | } |
276 | | |
277 | 3.83k | void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) { |
278 | 3.83k | uint64_t swapped[2]; |
279 | 3.83k | swapped[0] = CRYPTO_load_u64_be(Xi + 8); |
280 | 3.83k | swapped[1] = CRYPTO_load_u64_be(Xi); |
281 | 3.83k | gcm_polyval_nohw(swapped, &Htable[0]); |
282 | 3.83k | CRYPTO_store_u64_be(Xi, swapped[1]); |
283 | 3.83k | CRYPTO_store_u64_be(Xi + 8, swapped[0]); |
284 | 3.83k | } |
285 | | |
286 | | void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, |
287 | 662 | size_t len) { |
288 | 662 | uint64_t swapped[2]; |
289 | 662 | swapped[0] = CRYPTO_load_u64_be(Xi + 8); |
290 | 662 | swapped[1] = CRYPTO_load_u64_be(Xi); |
291 | | |
292 | 6.10k | while (len >= 16) { |
293 | 5.44k | swapped[0] ^= CRYPTO_load_u64_be(inp + 8); |
294 | 5.44k | swapped[1] ^= CRYPTO_load_u64_be(inp); |
295 | 5.44k | gcm_polyval_nohw(swapped, &Htable[0]); |
296 | 5.44k | inp += 16; |
297 | 5.44k | len -= 16; |
298 | 5.44k | } |
299 | | |
300 | 662 | CRYPTO_store_u64_be(Xi, swapped[1]); |
301 | 662 | CRYPTO_store_u64_be(Xi + 8, swapped[0]); |
302 | 662 | } |