Coverage Report

Created: 2024-11-21 07:03

/src/boringssl/crypto/fipsmodule/modes/gcm_nohw.c.inc
Line |  Count | Source
-----+--------+------------------------------------------------------------------------------
   1 |        | /* Copyright (c) 2019, Google Inc.
   2 |        |  *
   3 |        |  * Permission to use, copy, modify, and/or distribute this software for any
   4 |        |  * purpose with or without fee is hereby granted, provided that the above
   5 |        |  * copyright notice and this permission notice appear in all copies.
   6 |        |  *
   7 |        |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   8 |        |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   9 |        |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  10 |        |  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  11 |        |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
  12 |        |  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  13 |        |  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
  14 |        |
  15 |        | #include <openssl/base.h>
  16 |        |
  17 |        | #include "../../internal.h"
  18 |        | #include "internal.h"
  19 |        |
  20 |        | #if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
  21 |        | #include <emmintrin.h>
  22 |        | #endif
  23 |        |
  24 |        |
  25 |        | // This file contains a constant-time implementation of GHASH based on the notes
  26 |        | // in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
  27 |        | // algorithm described in
  28 |        | // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
  29 |        | //
  30 |        | // Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
  31 |        | // primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
  32 |        | // the 32-bit implementation, but we can use its intrinsics if necessary.
  33 |        |
  34 |        | #if defined(BORINGSSL_HAS_UINT128)
  35 |        |
  36 |        | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
  37 |  27.8k |                            uint64_t b) {
  38 |        |   // One term every four bits means the largest term is 64/4 = 16, which barely
  39 |        |   // overflows into the next term. Using one term every five bits would cost 25
  40 |        |   // multiplications instead of 16. It is faster to mask off the bottom four
  41 |        |   // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits
  42 |        |   // separately.
  43 |  27.8k |   uint64_t a0 = a & UINT64_C(0x1111111111111110);
  44 |  27.8k |   uint64_t a1 = a & UINT64_C(0x2222222222222220);
  45 |  27.8k |   uint64_t a2 = a & UINT64_C(0x4444444444444440);
  46 |  27.8k |   uint64_t a3 = a & UINT64_C(0x8888888888888880);
  47 |        |
  48 |  27.8k |   uint64_t b0 = b & UINT64_C(0x1111111111111111);
  49 |  27.8k |   uint64_t b1 = b & UINT64_C(0x2222222222222222);
  50 |  27.8k |   uint64_t b2 = b & UINT64_C(0x4444444444444444);
  51 |  27.8k |   uint64_t b3 = b & UINT64_C(0x8888888888888888);
  52 |        |
  53 |  27.8k |   uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^
  54 |  27.8k |                  (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1);
  55 |  27.8k |   uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^
  56 |  27.8k |                  (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2);
  57 |  27.8k |   uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^
  58 |  27.8k |                  (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3);
  59 |  27.8k |   uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^
  60 |  27.8k |                  (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0);
  61 |        |
  62 |        |   // Multiply the bottom four bits of |a| with |b|.
  63 |  27.8k |   uint64_t a0_mask = UINT64_C(0) - (a & 1);
  64 |  27.8k |   uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1);
  65 |  27.8k |   uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1);
  66 |  27.8k |   uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1);
  67 |  27.8k |   uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^
  68 |  27.8k |                     ((uint128_t)(a2_mask & b) << 2) ^
  69 |  27.8k |                     ((uint128_t)(a3_mask & b) << 3);
  70 |        |
  71 |  27.8k |   *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^
  72 |  27.8k |             (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^
  73 |  27.8k |             (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^
  74 |  27.8k |             (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra);
  75 |  27.8k |   *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^
  76 |  27.8k |             (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^
  77 |  27.8k |             (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^
  78 |  27.8k |             (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^
  79 |  27.8k |             ((uint64_t)(extra >> 64));
  80 |  27.8k | }
  81 |        |
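
The bit-sliced products above implement carry-less multiplication in GF(2)[x] using ordinary integer multiplies while touching every input bit the same way regardless of its value. As a point of reference (not part of BoringSSL), a schoolbook shift-and-XOR carry-less multiply computes the same 64x64 -> 128-bit product; it branches on the bits of |a|, so it is only usable as a test oracle, never on secret data. The helper name clmul64_ref is made up.

    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical reference: variable-time carry-less 64x64 -> 128-bit multiply.
    // gcm_mul64_nohw is expected to produce the same (lo, hi) pair for any inputs.
    static void clmul64_ref(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                            uint64_t b) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((a >> i) & 1) {  // secret-dependent branch: test-only
          lo ^= b << i;
          hi ^= (i == 0) ? 0 : (b >> (64 - i));
        }
      }
      *out_lo = lo;
      *out_hi = hi;
    }

    int main(void) {
      uint64_t lo, hi;
      clmul64_ref(&lo, &hi, UINT64_C(0x8000000000000001), UINT64_C(0x3));
      // (x^63 + 1) * (x + 1) = x^64 + x^63 + x + 1,
      // so this prints hi=0000000000000001 lo=8000000000000003.
      printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi,
             (unsigned long long)lo);
      return 0;
    }
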
  82 |        | #elif defined(OPENSSL_SSE2)
  83 |        |
  84 |        | static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
  85 |        |   // One term every four bits means the largest term is 32/4 = 8, which does not
  86 |        |   // overflow into the next term.
  87 |        |   __m128i aa = _mm_setr_epi32(a, 0, a, 0);
  88 |        |   __m128i bb = _mm_setr_epi32(b, 0, b, 0);
  89 |        |
  90 |        |   __m128i a0a0 =
  91 |        |       _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
  92 |        |   __m128i a2a2 =
  93 |        |       _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
  94 |        |   __m128i b0b1 =
  95 |        |       _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
  96 |        |   __m128i b2b3 =
  97 |        |       _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));
  98 |        |
  99 |        |   __m128i c0c1 =
 100 |        |       _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
 101 |        |   __m128i c2c3 =
 102 |        |       _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));
 103 |        |
 104 |        |   __m128i a1a1 =
 105 |        |       _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
 106 |        |   __m128i a3a3 =
 107 |        |       _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
 108 |        |   __m128i b3b0 =
 109 |        |       _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
 110 |        |   __m128i b1b2 =
 111 |        |       _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));
 112 |        |
 113 |        |   c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
 114 |        |   c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
 115 |        |   c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
 116 |        |   c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));
 117 |        |
 118 |        |   c0c1 = _mm_and_si128(
 119 |        |       c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
 120 |        |   c2c3 = _mm_and_si128(
 121 |        |       c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));
 122 |        |
 123 |        |   c0c1 = _mm_xor_si128(c0c1, c2c3);
 124 |        |   // c0 ^= c1
 125 |        |   c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
 126 |        |   return c0c1;
 127 |        | }
 128 |        |
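
This branch shows no hit counts, presumably because the coverage run was built with uint128_t support and used the path above. Its lane layout relies on _mm_mul_epu32, which multiplies only the low 32 bits of each 64-bit lane, so packing |a| into lanes 0 and 2 lets one intrinsic produce two of the four masked products at once. A small standalone illustration of that intrinsic's behaviour (not BoringSSL code; requires an SSE2 target):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      // _mm_mul_epu32 reads the low 32 bits of each 64-bit lane (elements 0 and
      // 2) and writes two full 64-bit products, which is why gcm_mul32_nohw
      // packs the same 32-bit operand into both even lanes.
      __m128i x = _mm_setr_epi32(6, 0, 7, 0);
      __m128i y = _mm_setr_epi32(10, 0, 100, 0);
      __m128i p = _mm_mul_epu32(x, y);

      uint64_t out[2];
      memcpy(out, &p, sizeof(out));
      printf("%llu %llu\n", (unsigned long long)out[0],
             (unsigned long long)out[1]);  // prints "60 700"
      return 0;
    }
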
 129 |        | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
 130 |        |                            uint64_t b) {
 131 |        |   uint32_t a0 = a & 0xffffffff;
 132 |        |   uint32_t a1 = a >> 32;
 133 |        |   uint32_t b0 = b & 0xffffffff;
 134 |        |   uint32_t b1 = b >> 32;
 135 |        |   // Karatsuba multiplication.
 136 |        |   __m128i lo = gcm_mul32_nohw(a0, b0);
 137 |        |   __m128i hi = gcm_mul32_nohw(a1, b1);
 138 |        |   __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
 139 |        |   mid = _mm_xor_si128(mid, lo);
 140 |        |   mid = _mm_xor_si128(mid, hi);
 141 |        |   __m128i ret = _mm_unpacklo_epi64(lo, hi);
 142 |        |   mid = _mm_slli_si128(mid, 4);
 143 |        |   mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
 144 |        |   ret = _mm_xor_si128(ret, mid);
 145 |        |   memcpy(out_lo, &ret, 8);
 146 |        |   memcpy(out_hi, ((char*)&ret) + 8, 8);
 147 |        | }
 148 |        |
 149 |        | #else  // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2
 150 |        |
 151 |        | static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
 152 |        |   // One term every four bits means the largest term is 32/4 = 8, which does not
 153 |        |   // overflow into the next term.
 154 |        |   uint32_t a0 = a & 0x11111111;
 155 |        |   uint32_t a1 = a & 0x22222222;
 156 |        |   uint32_t a2 = a & 0x44444444;
 157 |        |   uint32_t a3 = a & 0x88888888;
 158 |        |
 159 |        |   uint32_t b0 = b & 0x11111111;
 160 |        |   uint32_t b1 = b & 0x22222222;
 161 |        |   uint32_t b2 = b & 0x44444444;
 162 |        |   uint32_t b3 = b & 0x88888888;
 163 |        |
 164 |        |   uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^
 165 |        |                 (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1);
 166 |        |   uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^
 167 |        |                 (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2);
 168 |        |   uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^
 169 |        |                 (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3);
 170 |        |   uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^
 171 |        |                 (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0);
 172 |        |
 173 |        |   return (c0 & UINT64_C(0x1111111111111111)) |
 174 |        |          (c1 & UINT64_C(0x2222222222222222)) |
 175 |        |          (c2 & UINT64_C(0x4444444444444444)) |
 176 |        |          (c3 & UINT64_C(0x8888888888888888));
 177 |        | }
 178 |        |
 179 |        | static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
 180 |        |                            uint64_t b) {
 181 |        |   uint32_t a0 = a & 0xffffffff;
 182 |        |   uint32_t a1 = a >> 32;
 183 |        |   uint32_t b0 = b & 0xffffffff;
 184 |        |   uint32_t b1 = b >> 32;
 185 |        |   // Karatsuba multiplication.
 186 |        |   uint64_t lo = gcm_mul32_nohw(a0, b0);
 187 |        |   uint64_t hi = gcm_mul32_nohw(a1, b1);
 188 |        |   uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
 189 |        |   *out_lo = lo ^ (mid << 32);
 190 |        |   *out_hi = hi ^ (mid >> 32);
 191 |        | }
 192 |        |
 193 |        | #endif  // BORINGSSL_HAS_UINT128
 194 |        |
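
Both 32-bit paths (SSE2 and the generic fallback) split each 64-bit operand into 32-bit halves and use Karatsuba so that only three 32x32-bit carry-less products are needed instead of four. As an explanatory note, writing lo = a0*b0, hi = a1*b1, and mid = (a0 + a1)*(b0 + b1), with + denoting addition in GF(2) (XOR):

    a*b = (a1*x^32 + a0) * (b1*x^32 + b0)
        = hi*x^64 + (a0*b1 + a1*b0)*x^32 + lo
        = hi*x^64 + (mid + hi + lo)*x^32 + lo

since expanding mid gives lo + hi + a0*b1 + a1*b0. This is exactly the recombination in |gcm_mul64_nohw| above: |mid| is XORed with |lo| and |hi|, shifted left by 32 bits (the SSE2 path uses _mm_slli_si128(mid, 4) for the same shift), and folded into the two output halves.
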
 195 |    694 | void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
 196 |        |   // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
 197 |        |   // avoids a shift by 1 in the multiplication, needed to account for bit
 198 |        |   // reversal losing a bit after multiplication, that is,
 199 |        |   // rev128(X) * rev128(Y) = rev255(X*Y).
 200 |        |   //
 201 |        |   // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
 202 |        |   // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
 203 |        |   //
 204 |        |   // See also slide 16 of
 205 |        |   // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
 206 |    694 |   Htable[0].lo = Xi[1];
 207 |    694 |   Htable[0].hi = Xi[0];
 208 |        |
 209 |    694 |   uint64_t carry = Htable[0].hi >> 63;
 210 |    694 |   carry = 0u - carry;
 211 |        |
 212 |    694 |   Htable[0].hi <<= 1;
 213 |    694 |   Htable[0].hi |= Htable[0].lo >> 63;
 214 |    694 |   Htable[0].lo <<= 1;
 215 |        |
 216 |        |   // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
 217 |        |   // conditionally add 0xc200...0001.
 218 |    694 |   Htable[0].lo ^= carry & 1;
 219 |    694 |   Htable[0].hi ^= carry & UINT64_C(0xc200000000000000);
 220 |        |
 221 |        |   // This implementation does not use the rest of |Htable|.
 222 |    694 | }
 223 |        |
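
|carry| above is either 0 or 1, and 0u - carry stretches it into an all-zero or all-ones mask, so the reduction constant is XORed in without branching on secret data. The hex constants follow from the polynomial named in the comment: with the overflowed x^128 term dropped, the remaining terms are represented as 0xc200000000000000 in the high word (bits 63, 62, and 57, i.e. 2^63 + 2^62 + 2^57) and as bit 0 of the low word. A minimal standalone sketch of the same branch-free conditional-XOR idiom, with a 64-bit value for brevity (the name reduce_if_high_bit and the values are made up, not BoringSSL code):

    #include <stdint.h>
    #include <stdio.h>

    // Illustrative only: shift left by one and XOR in |poly| when the top bit
    // of |v| was set, without a branch on the (potentially secret) bit.
    static uint64_t reduce_if_high_bit(uint64_t v, uint64_t poly) {
      uint64_t mask = 0u - (v >> 63);  // all-ones if bit 63 is set, else zero
      return (v << 1) ^ (mask & poly);
    }

    int main(void) {
      // Bit 63 is set, so the shifted value 0x...0002 is XORed with the
      // constant: prints c200000000000003.
      printf("%016llx\n", (unsigned long long)reduce_if_high_bit(
                              UINT64_C(0x8000000000000001),
                              UINT64_C(0xc200000000000001)));
      return 0;
    }
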
224
9.27k
static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) {
225
  // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
226
  // through |r3|. Note there is no byte or bit reversal because we are
227
  // evaluating POLYVAL.
228
9.27k
  uint64_t r0, r1;
229
9.27k
  gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo);
230
9.27k
  uint64_t r2, r3;
231
9.27k
  gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi);
232
9.27k
  uint64_t mid0, mid1;
233
9.27k
  gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo);
234
9.27k
  mid0 ^= r0 ^ r2;
235
9.27k
  mid1 ^= r1 ^ r3;
236
9.27k
  r2 ^= mid1;
237
9.27k
  r1 ^= mid0;
238
239
  // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and
240
  // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We
241
  // have:
242
  //
243
  //       1 = x^121 + x^126 + x^127 + x^128
244
  //  x^-128 = x^-7 + x^-2 + x^-1 + 1
245
  //
246
  // This is the GHASH reduction step, but with bits flowing in reverse.
247
248
  // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
249
  // another reduction steps. Instead, we gather the excess bits, incorporate
250
  // them into |r0| and |r1| and reduce once. See slides 17-19
251
  // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
252
9.27k
  r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);
253
254
  // 1
255
9.27k
  r2 ^= r0;
256
9.27k
  r3 ^= r1;
257
258
  // x^-1
259
9.27k
  r2 ^= r0 >> 1;
260
9.27k
  r2 ^= r1 << 63;
261
9.27k
  r3 ^= r1 >> 1;
262
263
  // x^-2
264
9.27k
  r2 ^= r0 >> 2;
265
9.27k
  r2 ^= r1 << 62;
266
9.27k
  r3 ^= r1 >> 2;
267
268
  // x^-7
269
9.27k
  r2 ^= r0 >> 7;
270
9.27k
  r2 ^= r1 << 57;
271
9.27k
  r3 ^= r1 >> 7;
272
273
9.27k
  Xi[0] = r2;
274
9.27k
  Xi[1] = r3;
275
9.27k
}
276
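
The two displayed identities in the comment above are one division apart: multiplying 1 = x^121 + x^126 + x^127 + x^128 through by x^-128 gives

    x^-128 = x^-7 + x^-2 + x^-1 + 1,

so multiplying the low half (|r0|, |r1|) by x^-128 costs only the value itself plus three shifted copies, which is exactly what the four commented groups of XORs above implement.
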
 277 |  3.83k | void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) {
 278 |  3.83k |   uint64_t swapped[2];
 279 |  3.83k |   swapped[0] = CRYPTO_load_u64_be(Xi + 8);
 280 |  3.83k |   swapped[1] = CRYPTO_load_u64_be(Xi);
 281 |  3.83k |   gcm_polyval_nohw(swapped, &Htable[0]);
 282 |  3.83k |   CRYPTO_store_u64_be(Xi, swapped[1]);
 283 |  3.83k |   CRYPTO_store_u64_be(Xi + 8, swapped[0]);
 284 |  3.83k | }
 285 |        |
286
void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
287
662
                    size_t len) {
288
662
  uint64_t swapped[2];
289
662
  swapped[0] = CRYPTO_load_u64_be(Xi + 8);
290
662
  swapped[1] = CRYPTO_load_u64_be(Xi);
291
292
6.10k
  while (len >= 16) {
293
5.44k
    swapped[0] ^= CRYPTO_load_u64_be(inp + 8);
294
5.44k
    swapped[1] ^= CRYPTO_load_u64_be(inp);
295
5.44k
    gcm_polyval_nohw(swapped, &Htable[0]);
296
5.44k
    inp += 16;
297
5.44k
    len -= 16;
298
5.44k
  }
299
300
662
  CRYPTO_store_u64_be(Xi, swapped[1]);
301
662
  CRYPTO_store_u64_be(Xi + 8, swapped[0]);
302
662
}
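
For context, these three exported functions are the portable GHASH backend that BoringSSL's GCM code falls back to when no hardware carry-less multiply is available. Below is a rough, compile-only sketch of the calling sequence, not BoringSSL code: the u128 layout and prototypes are assumed to match modes/internal.h, the helper names load_be64 and ghash_sketch are made up, and full GCM additionally folds in the AAD/ciphertext length block, which is omitted here.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // Assumed to match BoringSSL's internal declarations; this sketch will not
    // link outside the library.
    typedef struct { uint64_t hi, lo; } u128;
    void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]);
    void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]);
    void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16],
                        const uint8_t *inp, size_t len);

    static uint64_t load_be64(const uint8_t in[8]) {
      uint64_t v = 0;
      for (int i = 0; i < 8; i++) {
        v = (v << 8) | in[i];
      }
      return v;
    }

    // Hypothetical helper: GHASH |len| bytes under hash key |H|, zero-padding
    // the final partial block. The GCM length block is intentionally omitted.
    static void ghash_sketch(uint8_t out[16], const uint8_t H[16],
                             const uint8_t *data, size_t len) {
      uint64_t h[2] = {load_be64(H), load_be64(H + 8)};
      u128 Htable[16];
      gcm_init_nohw(Htable, h);  // the mulX_POLYVAL transform above

      memset(out, 0, 16);
      size_t full = len & ~(size_t)15;
      gcm_ghash_nohw(out, Htable, data, full);  // all whole 16-byte blocks

      if (len > full) {
        uint8_t last[16] = {0};
        memcpy(last, data + full, len - full);  // zero-padded tail
        for (int i = 0; i < 16; i++) {
          out[i] ^= last[i];
        }
        gcm_gmult_nohw(out, Htable);  // one more multiplication by H
      }
    }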