Coverage Report

Created: 2020-08-01 06:18

/src/botan/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
Line
Count
Source
1
/*
2
* Hook for CLMUL/PMULL/VPMSUM
3
* (C) 2013,2017,2019,2020 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/clmul_cpu.h>
9
#include <botan/internal/simd_32.h>
10
11
#if defined(BOTAN_SIMD_USE_SSE2)
12
  #include <immintrin.h>
13
  #include <wmmintrin.h>
14
#endif
15
16
namespace Botan {
17
18
namespace {
19
20
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in)
21
42.4k
   {
22
42.4k
#if defined(BOTAN_SIMD_USE_SSE2)
23
42.4k
   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
24
42.4k
   return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
25
#elif defined(BOTAN_SIMD_USE_NEON)
26
   const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
27
   const uint8x16_t mask = vld1q_u8(maskb);
28
   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
29
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
30
   const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
31
   return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask));
32
#endif
33
42.4k
   }
34
35
template<int M>
36
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
37
115k
   {
38
115k
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
39
115k
40
115k
#if defined(BOTAN_SIMD_USE_SSE2)
41
115k
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
42
#elif defined(BOTAN_SIMD_USE_NEON)
43
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
44
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
45
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
46
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
47
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
48
49
   SIMD_4x32 i1 = x;
50
   SIMD_4x32 i2 = H;
51
52
   if(M == 0x11)
53
      {
54
      i1 &= mask_lo;
55
      i2 &= mask_lo;
56
      }
57
   else if(M == 0x10)
58
      {
59
      i1 = i1.shift_elems_left<2>();
60
      }
61
   else if(M == 0x01)
62
      {
63
      i2 = i2.shift_elems_left<2>();
64
      }
65
   else if(M == 0x00)
66
      {
67
      i1 = mask_lo.andc(i1);
68
      i2 = mask_lo.andc(i2);
69
      }
70
71
   return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd(
72
                       (__vector unsigned long)i1.raw(),
73
                       (__vector unsigned long)i2.raw())
74
      );
75
#endif
76
115k
   }
clmul_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<17>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Line
Count
Source
37
35.9k
   {
38
35.9k
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
39
35.9k
40
35.9k
#if defined(BOTAN_SIMD_USE_SSE2)
41
35.9k
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
42
#elif defined(BOTAN_SIMD_USE_NEON)
43
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
44
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
45
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
46
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
47
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
48
49
   SIMD_4x32 i1 = x;
50
   SIMD_4x32 i2 = H;
51
52
   if(M == 0x11)
53
      {
54
      i1 &= mask_lo;
55
      i2 &= mask_lo;
56
      }
57
   else if(M == 0x10)
58
      {
59
      i1 = i1.shift_elems_left<2>();
60
      }
61
   else if(M == 0x01)
62
      {
63
      i2 = i2.shift_elems_left<2>();
64
      }
65
   else if(M == 0x00)
66
      {
67
      i1 = mask_lo.andc(i1);
68
      i2 = mask_lo.andc(i2);
69
      }
70
71
   return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd(
72
                       (__vector unsigned long)i1.raw(),
73
                       (__vector unsigned long)i2.raw())
74
      );
75
#endif
76
35.9k
   }
clmul_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<16>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Line
Count
Source
37
7.28k
   {
38
7.28k
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
39
7.28k
40
7.28k
#if defined(BOTAN_SIMD_USE_SSE2)
41
7.28k
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
42
#elif defined(BOTAN_SIMD_USE_NEON)
43
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
44
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
45
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
46
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
47
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
48
49
   SIMD_4x32 i1 = x;
50
   SIMD_4x32 i2 = H;
51
52
   if(M == 0x11)
53
      {
54
      i1 &= mask_lo;
55
      i2 &= mask_lo;
56
      }
57
   else if(M == 0x10)
58
      {
59
      i1 = i1.shift_elems_left<2>();
60
      }
61
   else if(M == 0x01)
62
      {
63
      i2 = i2.shift_elems_left<2>();
64
      }
65
   else if(M == 0x00)
66
      {
67
      i1 = mask_lo.andc(i1);
68
      i2 = mask_lo.andc(i2);
69
      }
70
71
   return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd(
72
                       (__vector unsigned long)i1.raw(),
73
                       (__vector unsigned long)i2.raw())
74
      );
75
#endif
76
7.28k
   }
clmul_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<1>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Line
Count
Source
37
7.28k
   {
38
7.28k
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
39
7.28k
40
7.28k
#if defined(BOTAN_SIMD_USE_SSE2)
41
7.28k
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
42
#elif defined(BOTAN_SIMD_USE_NEON)
43
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
44
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
45
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
46
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
47
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
48
49
   SIMD_4x32 i1 = x;
50
   SIMD_4x32 i2 = H;
51
52
   if(M == 0x11)
53
      {
54
      i1 &= mask_lo;
55
      i2 &= mask_lo;
56
      }
57
   else if(M == 0x10)
58
      {
59
      i1 = i1.shift_elems_left<2>();
60
      }
61
   else if(M == 0x01)
62
      {
63
      i2 = i2.shift_elems_left<2>();
64
      }
65
   else if(M == 0x00)
66
      {
67
      i1 = mask_lo.andc(i1);
68
      i2 = mask_lo.andc(i2);
69
      }
70
71
   return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd(
72
                       (__vector unsigned long)i1.raw(),
73
                       (__vector unsigned long)i2.raw())
74
      );
75
#endif
76
7.28k
   }
clmul_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<0>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Line
Count
Source
37
64.5k
   {
38
64.5k
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
39
64.5k
40
64.5k
#if defined(BOTAN_SIMD_USE_SSE2)
41
64.5k
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
42
#elif defined(BOTAN_SIMD_USE_NEON)
43
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
44
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
45
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
46
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
47
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
48
49
   SIMD_4x32 i1 = x;
50
   SIMD_4x32 i2 = H;
51
52
   if(M == 0x11)
53
      {
54
      i1 &= mask_lo;
55
      i2 &= mask_lo;
56
      }
57
   else if(M == 0x10)
58
      {
59
      i1 = i1.shift_elems_left<2>();
60
      }
61
   else if(M == 0x01)
62
      {
63
      i2 = i2.shift_elems_left<2>();
64
      }
65
   else if(M == 0x00)
66
      {
67
      i1 = mask_lo.andc(i1);
68
      i2 = mask_lo.andc(i2);
69
      }
70
71
   return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd(
72
                       (__vector unsigned long)i1.raw(),
73
                       (__vector unsigned long)i2.raw())
74
      );
75
#endif
76
64.5k
   }
77
78
inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1)
79
14.4k
   {
80
14.4k
   SIMD_4x32 X0 = B1.shr<31>();
81
14.4k
   SIMD_4x32 X1 = B1.shl<1>();
82
14.4k
   SIMD_4x32 X2 = B0.shr<31>();
83
14.4k
   SIMD_4x32 X3 = B0.shl<1>();
84
14.4k
85
14.4k
   X3 |= X0.shift_elems_right<3>();
86
14.4k
   X3 |= X2.shift_elems_left<1>();
87
14.4k
   X1 |= X0.shift_elems_left<1>();
88
14.4k
89
14.4k
   X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();
90
14.4k
91
14.4k
   X1 ^= X0.shift_elems_left<3>();
92
14.4k
93
14.4k
   X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
94
14.4k
   X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();
95
14.4k
   return X0;
96
14.4k
   }
97
98
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x)
99
7.28k
   {
100
7.28k
   SIMD_4x32 T0 = clmul<0x11>(H, x);
101
7.28k
   SIMD_4x32 T1 = clmul<0x10>(H, x);
102
7.28k
   SIMD_4x32 T2 = clmul<0x01>(H, x);
103
7.28k
   SIMD_4x32 T3 = clmul<0x00>(H, x);
104
7.28k
105
7.28k
   T1 ^= T2;
106
7.28k
   T0 ^= T1.shift_elems_right<2>();
107
7.28k
   T3 ^= T1.shift_elems_left<2>();
108
7.28k
109
7.28k
   return gcm_reduce(T0, T3);
110
7.28k
   }
111
112
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
113
   gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4,
114
                   const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4)
115
7.16k
   {
116
7.16k
   /*
117
7.16k
   * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
118
7.16k
   * and Pierre Laurent of Intel
119
7.16k
   */
120
7.16k
121
7.16k
   const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^
122
7.16k
                        (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));
123
7.16k
124
7.16k
   const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^
125
7.16k
                        (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
126
7.16k
127
7.16k
   SIMD_4x32 T;
128
7.16k
129
7.16k
   T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
130
7.16k
   T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
131
7.16k
   T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
132
7.16k
   T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
133
7.16k
   T ^= lo;
134
7.16k
   T ^= hi;
135
7.16k
136
7.16k
   return gcm_reduce(hi ^ T.shift_elems_right<2>(),
137
7.16k
                     lo ^ T.shift_elems_left<2>());
138
7.16k
   }
139
140
}
141
142
BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
143
void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
144
952
   {
145
952
   const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
146
952
   const SIMD_4x32 H2 = gcm_multiply(H1, H1);
147
952
   const SIMD_4x32 H3 = gcm_multiply(H1, H2);
148
952
   const SIMD_4x32 H4 = gcm_multiply(H2, H2);
149
952
150
952
   H1.store_le(H_pow);
151
952
   H2.store_le(H_pow + 2);
152
952
   H3.store_le(H_pow + 4);
153
952
   H4.store_le(H_pow + 6);
154
952
   }
155
156
BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
157
void gcm_multiply_clmul(uint8_t x[16],
158
                        const uint64_t H_pow[8],
159
                        const uint8_t input[], size_t blocks)
160
4.23k
   {
161
4.23k
   /*
162
4.23k
   * Algorithms 1 and 5 from Intel's CLMUL guide
163
4.23k
   */
164
4.23k
   const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);
165
4.23k
166
4.23k
   SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
167
4.23k
168
4.23k
   if(blocks >= 4)
169
508
      {
170
508
      const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
171
508
      const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
172
508
      const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);
173
508
174
7.67k
      while(blocks >= 4)
175
7.16k
         {
176
7.16k
         const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input       ));
177
7.16k
         const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1));
178
7.16k
         const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2));
179
7.16k
         const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3));
180
7.16k
181
7.16k
         a ^= m0;
182
7.16k
         a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);
183
7.16k
184
7.16k
         input += 4*16;
185
7.16k
         blocks -= 4;
186
7.16k
         }
187
508
      }
188
4.23k
189
8.65k
   for(size_t i = 0; i != blocks; ++i)
190
4.42k
      {
191
4.42k
      const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i));
192
4.42k
193
4.42k
      a ^= m;
194
4.42k
      a = gcm_multiply(H1, a);
195
4.42k
      }
196
4.23k
197
4.23k
   a = reverse_vector(a);
198
4.23k
   a.store_le(x);
199
4.23k
   }
200
201
}