/src/botan/src/lib/utils/ghash/ghash_cpu/ghash_cpu.cpp
Line | Count | Source |
1 | | /* |
2 | | * Hook for CLMUL/PMULL/VPMSUM |
3 | | * (C) 2013,2017,2019,2020 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/ghash.h> |
9 | | #include <botan/internal/simd_32.h> |
10 | | |
11 | | #if defined(BOTAN_SIMD_USE_SSE2) |
12 | | #include <immintrin.h> |
13 | | #include <wmmintrin.h> |
14 | | #endif |
15 | | |
16 | | namespace Botan { |
17 | | |
18 | | namespace { |
19 | | |
// Byte-reverse the 16-byte vector, converting between the little-endian
// order used by load_le/store_le and the byte order GHASH operates in.
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in)
   {
#if defined(BOTAN_SIMD_USE_SSE2)
   // PSHUFB with a descending index vector reverses all 16 bytes
   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
#elif defined(BOTAN_SIMD_USE_NEON)
   // TBL lookup with descending indices performs the same full byte reversal
   const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
   const uint8x16_t mask = vld1q_u8(maskb);
   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   // vperm with a descending permute vector reverses the 16 bytes
   const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
   return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask));
#endif
   }
34 | | |
/*
* 64x64 -> 128 bit carryless multiply of one half of x by one half of H.
*
* M follows the _mm_clmulepi64_si128 immediate convention: the low nibble
* selects which 64-bit half of x is used, the high nibble which half of H
* (0x00, 0x01, 0x10, 0x11 are the four half combinations).
*/
template<int M>
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
   {
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");

#if defined(BOTAN_SIMD_USE_SSE2)
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
#elif defined(BOTAN_SIMD_USE_NEON)
   // Extract the selected 64-bit lanes and multiply as polynomials over GF(2)
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   /*
   * vpmsumd multiplies BOTH doubleword pairs and XORs the two products
   * together, so mask or shift the inputs first such that only the halves
   * selected by M contribute a nonzero product.
   */
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);

   SIMD_4x32 i1 = x;
   SIMD_4x32 i2 = H;

   if(M == 0x11)
      {
      i1 &= mask_lo;
      i2 &= mask_lo;
      }
   else if(M == 0x10)
      {
      i1 = i1.shift_elems_left<2>();
      }
   else if(M == 0x01)
      {
      i2 = i2.shift_elems_left<2>();
      }
   else if(M == 0x00)
      {
      i1 = mask_lo.andc(i1);
      i2 = mask_lo.andc(i2);
      }

   auto i1v = reinterpret_cast<__vector unsigned long long>(i1.raw());
   auto i2v = reinterpret_cast<__vector unsigned long long>(i2.raw());

#if defined(__clang__)
   auto rv = __builtin_altivec_crypto_vpmsumd(i1v, i2v);
#else
   auto rv = __builtin_crypto_vpmsumd(i1v, i2v);
#endif

   return SIMD_4x32(reinterpret_cast<__vector unsigned int>(rv));
#endif
   }
83 | | |
84 | | inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1) |
85 | 4.32k | { |
86 | 4.32k | SIMD_4x32 X0 = B1.shr<31>(); |
87 | 4.32k | SIMD_4x32 X1 = B1.shl<1>(); |
88 | 4.32k | SIMD_4x32 X2 = B0.shr<31>(); |
89 | 4.32k | SIMD_4x32 X3 = B0.shl<1>(); |
90 | | |
91 | 4.32k | X3 |= X0.shift_elems_right<3>(); |
92 | 4.32k | X3 |= X2.shift_elems_left<1>(); |
93 | 4.32k | X1 |= X0.shift_elems_left<1>(); |
94 | | |
95 | 4.32k | X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>(); |
96 | | |
97 | 4.32k | X1 ^= X0.shift_elems_left<3>(); |
98 | | |
99 | 4.32k | X0 = X1 ^ X3 ^ X0.shift_elems_right<1>(); |
100 | 4.32k | X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>(); |
101 | 4.32k | return X0; |
102 | 4.32k | } |
103 | | |
104 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x) |
105 | 2.79k | { |
106 | 2.79k | SIMD_4x32 T0 = clmul<0x11>(H, x); |
107 | 2.79k | SIMD_4x32 T1 = clmul<0x10>(H, x); |
108 | 2.79k | SIMD_4x32 T2 = clmul<0x01>(H, x); |
109 | 2.79k | SIMD_4x32 T3 = clmul<0x00>(H, x); |
110 | | |
111 | 2.79k | T1 ^= T2; |
112 | 2.79k | T0 ^= T1.shift_elems_right<2>(); |
113 | 2.79k | T3 ^= T1.shift_elems_left<2>(); |
114 | | |
115 | 2.79k | return gcm_reduce(T0, T3); |
116 | 2.79k | } |
117 | | |
118 | | inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) |
119 | | gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4, |
120 | | const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4) |
121 | 1.52k | { |
122 | | /* |
123 | | * Mutiply with delayed reduction, algorithm by Krzysztof Jankowski |
124 | | * and Pierre Laurent of Intel |
125 | | */ |
126 | | |
127 | 1.52k | const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^ |
128 | 1.52k | (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4)); |
129 | | |
130 | 1.52k | const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^ |
131 | 1.52k | (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4)); |
132 | | |
133 | 1.52k | SIMD_4x32 T; |
134 | | |
135 | 1.52k | T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>()); |
136 | 1.52k | T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>()); |
137 | 1.52k | T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>()); |
138 | 1.52k | T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>()); |
139 | 1.52k | T ^= lo; |
140 | 1.52k | T ^= hi; |
141 | | |
142 | 1.52k | return gcm_reduce(hi ^ T.shift_elems_right<2>(), |
143 | 1.52k | lo ^ T.shift_elems_left<2>()); |
144 | 1.52k | } |
145 | | |
146 | | } |
147 | | |
148 | | BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) |
149 | | void GHASH::ghash_precompute_cpu(const uint8_t H_bytes[16], uint64_t H_pow[4*2]) |
150 | 368 | { |
151 | 368 | const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes)); |
152 | 368 | const SIMD_4x32 H2 = gcm_multiply(H1, H1); |
153 | 368 | const SIMD_4x32 H3 = gcm_multiply(H1, H2); |
154 | 368 | const SIMD_4x32 H4 = gcm_multiply(H2, H2); |
155 | | |
156 | 368 | H1.store_le(H_pow); |
157 | 368 | H2.store_le(H_pow + 2); |
158 | 368 | H3.store_le(H_pow + 4); |
159 | 368 | H4.store_le(H_pow + 6); |
160 | 368 | } |
161 | | |
162 | | BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) |
163 | | void GHASH::ghash_multiply_cpu(uint8_t x[16], |
164 | | const uint64_t H_pow[8], |
165 | | const uint8_t input[], size_t blocks) |
166 | 1.64k | { |
167 | | /* |
168 | | * Algorithms 1 and 5 from Intel's CLMUL guide |
169 | | */ |
170 | 1.64k | const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow); |
171 | | |
172 | 1.64k | SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x)); |
173 | | |
174 | 1.64k | if(blocks >= 4) |
175 | 285 | { |
176 | 285 | const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2); |
177 | 285 | const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4); |
178 | 285 | const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6); |
179 | | |
180 | 1.80k | while(blocks >= 4) |
181 | 1.52k | { |
182 | 1.52k | const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input )); |
183 | 1.52k | const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1)); |
184 | 1.52k | const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2)); |
185 | 1.52k | const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3)); |
186 | | |
187 | 1.52k | a ^= m0; |
188 | 1.52k | a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a); |
189 | | |
190 | 1.52k | input += 4*16; |
191 | 1.52k | blocks -= 4; |
192 | 1.52k | } |
193 | 285 | } |
194 | | |
195 | 3.34k | for(size_t i = 0; i != blocks; ++i) |
196 | 1.69k | { |
197 | 1.69k | const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i)); |
198 | | |
199 | 1.69k | a ^= m; |
200 | 1.69k | a = gcm_multiply(H1, a); |
201 | 1.69k | } |
202 | | |
203 | 1.64k | a = reverse_vector(a); |
204 | 1.64k | a.store_le(x); |
205 | 1.64k | } |
206 | | |
207 | | } |