/src/botan/src/lib/modes/aead/gcm/clmul_cpu/clmul_cpu.cpp
Line | Count | Source |
1 | | /* |
2 | | * Hook for CLMUL/PMULL/VPMSUM |
3 | | * (C) 2013,2017,2019,2020 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/clmul_cpu.h> |
9 | | #include <botan/internal/simd_32.h> |
10 | | |
11 | | #if defined(BOTAN_SIMD_USE_SSE2) |
12 | | #include <immintrin.h> |
13 | | #include <wmmintrin.h> |
14 | | #endif |
15 | | |
16 | | namespace Botan { |
17 | | |
18 | | namespace { |
19 | | |
20 | | BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in) |
21 | 42.4k | { |
22 | 42.4k | #if defined(BOTAN_SIMD_USE_SSE2) |
23 | 42.4k | const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
24 | 42.4k | return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK)); |
25 | | #elif defined(BOTAN_SIMD_USE_NEON) |
26 | | const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; |
27 | | const uint8x16_t mask = vld1q_u8(maskb); |
28 | | return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask))); |
29 | | #elif defined(BOTAN_SIMD_USE_ALTIVEC) |
30 | | const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; |
31 | | return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask)); |
32 | | #endif |
33 | 42.4k | } |
34 | | |
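
reverse_vector byte-reverses each 128-bit block: GHASH treats blocks as big-endian polynomial coefficients, while SIMD_4x32::load_le loads them little-endian. As a scalar sketch (illustration only, not part of Botan's API), the whole operation reduces to:

    #include <algorithm>
    #include <cstdint>

    // Swap byte 0 with byte 15, byte 1 with byte 14, and so on.
    void reverse_bytes_128(uint8_t block[16])
       {
       std::reverse(block, block + 16);
       }
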
35 | | template<int M> |
36 | | BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x) |
37 | 115k | { |
38 | 115k | static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode"); |
39 | 115k | |
40 | 115k | #if defined(BOTAN_SIMD_USE_SSE2) |
41 | 115k | return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M)); |
42 | | #elif defined(BOTAN_SIMD_USE_NEON) |
43 | | const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01); |
44 | | const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4); |
45 | | return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b))); |
46 | | #elif defined(BOTAN_SIMD_USE_ALTIVEC) |
47 | | const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF); |
48 | | |
49 | | SIMD_4x32 i1 = x; |
50 | | SIMD_4x32 i2 = H; |
51 | | |
52 | | if(M == 0x11) |
53 | | { |
54 | | i1 &= mask_lo; |
55 | | i2 &= mask_lo; |
56 | | } |
57 | | else if(M == 0x10) |
58 | | { |
59 | | i1 = i1.shift_elems_left<2>(); |
60 | | } |
61 | | else if(M == 0x01) |
62 | | { |
63 | | i2 = i2.shift_elems_left<2>(); |
64 | | } |
65 | | else if(M == 0x00) |
66 | | { |
67 | | i1 = mask_lo.andc(i1); |
68 | | i2 = mask_lo.andc(i2); |
69 | | } |
70 | | |
71 | | return SIMD_4x32((__vector unsigned int)__builtin_crypto_vpmsumd( |
72 | | (__vector unsigned long)i1.raw(), |
73 | | (__vector unsigned long)i2.raw()) |
74 | | ); |
75 | | #endif |
76 | 115k | } |
[Per-instantiation coverage for clmul<M>: clmul<0x11> 35.9k, clmul<0x10> 7.28k, clmul<0x01> 7.28k, clmul<0x00> 64.5k executions; each instantiation's per-line counts mirror lines 37-76 above.]
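
The template parameter M mirrors the immediate of _mm_clmulepi64_si128: bit 0 selects the low or high 64-bit half of x, bit 4 the half of H, and the chosen halves are multiplied without carries in GF(2)[x] (the NEON and AltiVec branches reproduce the same selection via lane extraction and masking/shifting). A scalar shift-and-xor sketch of the 64x64->128 carry-less multiply that the intrinsics perform in one instruction (illustrative only):

    #include <cstdint>

    // Carry-less multiply: like schoolbook integer multiplication, but the
    // partial products are combined with XOR, so no carries propagate.
    void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
       {
       hi = lo = 0;
       for(unsigned i = 0; i != 64; ++i)
          {
          if((b >> i) & 1)
             {
             lo ^= (a << i);
             if(i > 0)
                hi ^= (a >> (64 - i));
             }
          }
       }
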
77 | | |
78 | | inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1) |
79 | 14.4k | { |
80 | 14.4k | SIMD_4x32 X0 = B1.shr<31>(); |
81 | 14.4k | SIMD_4x32 X1 = B1.shl<1>(); |
82 | 14.4k | SIMD_4x32 X2 = B0.shr<31>(); |
83 | 14.4k | SIMD_4x32 X3 = B0.shl<1>(); |
84 | 14.4k | |
85 | 14.4k | X3 |= X0.shift_elems_right<3>(); |
86 | 14.4k | X3 |= X2.shift_elems_left<1>(); |
87 | 14.4k | X1 |= X0.shift_elems_left<1>(); |
88 | 14.4k | |
89 | 14.4k | X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>(); |
90 | 14.4k | |
91 | 14.4k | X1 ^= X0.shift_elems_left<3>(); |
92 | 14.4k | |
93 | 14.4k | X0 = X1 ^ X3 ^ X0.shift_elems_right<1>(); |
94 | 14.4k | X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>(); |
95 | 14.4k | return X0; |
96 | 14.4k | } |
97 | | |
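
gcm_reduce folds the 256-bit carry-less product back into GF(2^128) modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1. Because x^128 ≡ x^7 + x^2 + x + 1 (mod g), a product split as A(x)·x^128 + B(x) reduces as

    A(x)·x^128 + B(x) ≡ A(x)·(x^7 + x^2 + x + 1) + B(x)   (mod g(x))

The leading shl<1>/shr<31> pairs perform the one-bit left shift required by the bit-reflected representation GHASH uses, and the shift amounts 31/30/25 and 7/2/1 are the reflected images of the x, x^2 and x^7 taps of g.
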
98 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x) |
99 | 7.28k | { |
100 | 7.28k | SIMD_4x32 T0 = clmul<0x11>(H, x); |
101 | 7.28k | SIMD_4x32 T1 = clmul<0x10>(H, x); |
102 | 7.28k | SIMD_4x32 T2 = clmul<0x01>(H, x); |
103 | 7.28k | SIMD_4x32 T3 = clmul<0x00>(H, x); |
104 | 7.28k | |
105 | 7.28k | T1 ^= T2; |
106 | 7.28k | T0 ^= T1.shift_elems_right<2>(); |
107 | 7.28k | T3 ^= T1.shift_elems_left<2>(); |
108 | 7.28k | |
109 | 7.28k | return gcm_reduce(T0, T3); |
110 | 7.28k | } |
111 | | |
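
gcm_multiply assembles the full 128x128 product from four 64-bit carry-less multiplies. Writing H = H_hi·x^64 + H_lo and x = x_hi·x^64 + x_lo:

    H·x = (H_hi·x_hi)·x^128 ⊕ (H_hi·x_lo ⊕ H_lo·x_hi)·x^64 ⊕ (H_lo·x_lo)

T0 and T3 are the outer terms, T1 ^ T2 is the middle term, and the two element shifts split that middle term across the high (T0) and low (T3) halves before the single reduction.
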
112 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) |
113 | | gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4, |
114 | | const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4) |
115 | 7.16k | { |
116 | 7.16k | /* |
117 | 7.16k | * Multiply with delayed reduction, algorithm by Krzysztof Jankowski |
118 | 7.16k | * and Pierre Laurent of Intel |
119 | 7.16k | */ |
120 | 7.16k | |
121 | 7.16k | const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^ |
122 | 7.16k | (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4)); |
123 | 7.16k | |
124 | 7.16k | const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^ |
125 | 7.16k | (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4)); |
126 | 7.16k | |
127 | 7.16k | SIMD_4x32 T; |
128 | 7.16k | |
129 | 7.16k | T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>()); |
130 | 7.16k | T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>()); |
131 | 7.16k | T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>()); |
132 | 7.16k | T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>()); |
133 | 7.16k | T ^= lo; |
134 | 7.16k | T ^= hi; |
135 | 7.16k | |
136 | 7.16k | return gcm_reduce(hi ^ T.shift_elems_right<2>(), |
137 | 7.16k | lo ^ T.shift_elems_left<2>()); |
138 | 7.16k | } |
139 | | |
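
The caller (gcm_multiply_clmul below) XORs m0 into the accumulator and passes the blocks newest-power-first, so one call evaluates four GHASH steps with just one reduction:

    a' = (a ⊕ m0)·H^4 ⊕ m1·H^3 ⊕ m2·H^2 ⊕ m3·H

Each per-block product additionally uses the Karatsuba identity (a1 ⊕ a0)(b1 ⊕ b0) = a1·b1 ⊕ a0·b0 ⊕ (a1·b0 ⊕ a0·b1): the clmul<0x00> calls on folded halves, XORed with lo and hi, recover the middle terms with three multiplies per block rather than four.
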
140 | | } |
141 | | |
142 | | BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) |
143 | | void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2]) |
144 | 952 | { |
145 | 952 | const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes)); |
146 | 952 | const SIMD_4x32 H2 = gcm_multiply(H1, H1); |
147 | 952 | const SIMD_4x32 H3 = gcm_multiply(H1, H2); |
148 | 952 | const SIMD_4x32 H4 = gcm_multiply(H2, H2); |
149 | 952 | |
150 | 952 | H1.store_le(H_pow); |
151 | 952 | H2.store_le(H_pow + 2); |
152 | 952 | H3.store_le(H_pow + 4); |
153 | 952 | H4.store_le(H_pow + 6); |
154 | 952 | } |
155 | | |
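
The three gcm_multiply calls derive successive powers of the hash subkey in GF(2^128),

    H2 = H·H,   H3 = H·H2,   H4 = H2·H2

which gcm_multiply_clmul reloads to drive the four-block aggregated path.
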
156 | | BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) |
157 | | void gcm_multiply_clmul(uint8_t x[16], |
158 | | const uint64_t H_pow[8], |
159 | | const uint8_t input[], size_t blocks) |
160 | 4.23k | { |
161 | 4.23k | /* |
162 | 4.23k | * Algorithms 1 and 5 from Intel's CLMUL guide |
163 | 4.23k | */ |
164 | 4.23k | const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow); |
165 | 4.23k | |
166 | 4.23k | SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x)); |
167 | 4.23k | |
168 | 4.23k | if(blocks >= 4) |
169 | 508 | { |
170 | 508 | const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2); |
171 | 508 | const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4); |
172 | 508 | const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6); |
173 | 508 | |
174 | 7.67k | while(blocks >= 4) |
175 | 7.16k | { |
176 | 7.16k | const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input )); |
177 | 7.16k | const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1)); |
178 | 7.16k | const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2)); |
179 | 7.16k | const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3)); |
180 | 7.16k | |
181 | 7.16k | a ^= m0; |
182 | 7.16k | a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a); |
183 | 7.16k | |
184 | 7.16k | input += 4*16; |
185 | 7.16k | blocks -= 4; |
186 | 7.16k | } |
187 | 508 | } |
188 | 4.23k | |
189 | 8.65k | for(size_t i = 0; i != blocks; ++i) |
190 | 4.42k | { |
191 | 4.42k | const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i)); |
192 | 4.42k | |
193 | 4.42k | a ^= m; |
194 | 4.42k | a = gcm_multiply(H1, a); |
195 | 4.42k | } |
196 | 4.23k | |
197 | 4.23k | a = reverse_vector(a); |
198 | 4.23k | a.store_le(x); |
199 | 4.23k | } |
200 | | |
201 | | } |
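
Taken together, a minimal hypothetical driver (the two signatures are the entry points defined above, normally declared in botan/internal/clmul_cpu.h; the subkey and data values are placeholders, and linking assumes a build with CLMUL/PMULL/VPMSUM enabled):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    namespace Botan {
    void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2]);
    void gcm_multiply_clmul(uint8_t x[16], const uint64_t H_pow[8],
                            const uint8_t input[], size_t blocks);
    }

    int main()
       {
       const uint8_t H[16] = {0};   // hash subkey; AES_K(0^128) in real GCM
       uint8_t x[16] = {0};         // GHASH accumulator
       uint8_t data[4*16] = {0};    // four blocks to authenticate

       uint64_t H_pow[8];
       Botan::gcm_clmul_precompute(H, H_pow);        // store H^1..H^4
       Botan::gcm_multiply_clmul(x, H_pow, data, 4); // fold in four blocks

       for(uint8_t b : x)
          std::printf("%02x", b);
       std::printf("\n");
       return 0;
       }
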