/src/cryptopp/gf2n_simd.cpp
Line | Count | Source |
1 | | // gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton |
2 | | // Also based on PCLMULQDQ code by Jankowski, Laurent and |
3 | | // O'Mahony from Intel (see reference below). |
4 | | // |
5 | | // This source file uses intrinsics and built-ins to gain access to |
6 | | // CLMUL, ARMv8a, and Power8 instructions. A separate source file is |
7 | | // needed because additional CXXFLAGS are required to enable the |
8 | | // appropriate instruction sets in some build configurations. |
9 | | // |
10 | | // Several speedups were taken from Intel Polynomial Multiplication |
11 | | // Instruction and its Usage for Elliptic Curve Cryptography, by |
12 | | // Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony, |
13 | | // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf |
14 | | // There may be more speedups available, see https://eprint.iacr.org/2011/589.pdf. |
15 | | // The IACR paper performs, by hand, some optimizations that the compiler is |
16 | | // expected to perform, like Common Subexpression Elimination to save |
17 | | // on variables (among others). Note that the compiler may miss those |
18 | | // optimizations, so the IACR paper is still useful. However, that code is |
19 | | // GPLv3, which is toxic for some users of the library, so it is not used here. |
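//
// For readers unfamiliar with the instructions: PCLMULQDQ, PMULL and VPMSUMD
// all compute a carry-less (binary polynomial) product of two 64-bit values.
// A minimal scalar sketch of that primitive follows, purely for illustration;
// the name and signature are ours and not part of the library:
//
//   // Carry-less 64x64 -> 128-bit multiply: shift-and-XOR instead of shift-and-add.
//   inline void CarrylessMultiply64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
//   {
//       hi = lo = 0;
//       for (unsigned int i = 0; i < 64; ++i)
//           if ((b >> i) & 1) {
//               lo ^= a << i;                    // low 64 bits of (a << i)
//               if (i > 0) hi ^= a >> (64 - i);  // bits shifted past bit 63
//           }
//   }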
20 | | |
21 | | #include "pch.h" |
22 | | #include "config.h" |
23 | | |
24 | | #ifndef CRYPTOPP_IMPORTS |
25 | | |
26 | | #include "gf2n.h" |
27 | | |
28 | | #if (CRYPTOPP_CLMUL_AVAILABLE) |
29 | | # include <emmintrin.h> |
30 | | # include <wmmintrin.h> |
31 | | #endif |
32 | | |
33 | | #if (CRYPTOPP_ARM_PMULL_AVAILABLE) |
34 | | # include "arm_simd.h" |
35 | | #endif |
36 | | |
37 | | #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) |
38 | | # include "ppc_simd.h" |
39 | | #endif |
40 | | |
41 | | // Squash MS LNK4221 and libtool warnings |
42 | | extern const char GF2N_SIMD_FNAME[] = __FILE__; |
43 | | |
44 | | ANONYMOUS_NAMESPACE_BEGIN |
45 | | |
46 | | // ************************** ARMv8 ************************** // |
47 | | |
48 | | using CryptoPP::word; |
49 | | |
50 | | #if (CRYPTOPP_ARM_PMULL_AVAILABLE) |
51 | | |
52 | | // c1c0 = a * b |
53 | | inline void |
54 | | F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b) |
55 | | { |
56 | | uint64x2_t t1, t2, z0={0}; |
57 | | |
58 | | c0 = PMULL_00(a, b); |
59 | | c1 = PMULL_11(a, b); |
60 | | t1 = vmovq_n_u64(vgetq_lane_u64(a, 1)); |
61 | | t1 = veorq_u64(a, t1); |
62 | | t2 = vmovq_n_u64(vgetq_lane_u64(b, 1)); |
63 | | t2 = veorq_u64(b, t2); |
64 | | t1 = PMULL_00(t1, t2); |
65 | | t1 = veorq_u64(c0, t1); |
66 | | t1 = veorq_u64(c1, t1); |
67 | | t2 = t1; |
68 | | t1 = vextq_u64(z0, t1, 1); |
69 | | t2 = vextq_u64(t2, z0, 1); |
70 | | c0 = veorq_u64(c0, t1); |
71 | | c1 = veorq_u64(c1, t2); |
72 | | } |
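// The routine above uses the Karatsuba-style identity from the Intel paper:
// writing a = a1*x^64 + a0 and b = b1*x^64 + b0 (all arithmetic carry-less),
//
//   a*b = a1*b1*x^128 + ((a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0)*x^64 + a0*b0
//
// so three PMULLs suffice instead of four. The two vextq_u64 steps place the
// 128-bit middle term across the high half of c0 and the low half of c1.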
73 | | |
74 | | // c3c2c1c0 = a1a0 * b1b0 |
75 | | inline void |
76 | | F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0, |
77 | | const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0) |
78 | | { |
79 | | uint64x2_t c4, c5; |
80 | | uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1; |
81 | | |
82 | | F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0); |
83 | | F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1); |
84 | | |
85 | | x0 = veorq_u64(x0, x1); |
86 | | y0 = veorq_u64(y0, y1); |
87 | | |
88 | | F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0); |
89 | | |
90 | | c4 = veorq_u64(c4, c0); |
91 | | c4 = veorq_u64(c4, c2); |
92 | | c5 = veorq_u64(c5, c1); |
93 | | c5 = veorq_u64(c5, c3); |
94 | | c1 = veorq_u64(c1, c4); |
95 | | c2 = veorq_u64(c2, c5); |
96 | | } |
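// Same Karatsuba split one level up: with A = a1*x^128 + a0 and B = b1*x^128 + b0,
//
//   A*B = a1*b1*x^256 + ((a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0)*x^128 + a0*b0
//
// Here c1:c0 holds a0*b0, c3:c2 holds a1*b1, and c5:c4 accumulates the middle
// term before it is folded into c2:c1.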
97 | | |
98 | | // c3c2c1c0 = a1a0 * a1a0 |
99 | | inline void |
100 | | F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, |
101 | | uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0) |
102 | | { |
103 | | c0 = PMULL_00(a0, a0); |
104 | | c1 = PMULL_11(a0, a0); |
105 | | c2 = PMULL_00(a1, a1); |
106 | | c3 = PMULL_11(a1, a1); |
107 | | } |
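// Squaring needs no cross terms: over GF(2), (u + v)^2 = u^2 + v^2, so the
// square of a 256-bit element is just its four 64-bit limbs squared
// independently. Each carry-less self-multiply simply spreads a limb's bits
// into the even bit positions of a 128-bit result.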
108 | | |
109 | | // x = (x << n), z = 0 |
110 | | template <unsigned int N> |
111 | | inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x) |
112 | | { |
113 | | uint64x2_t u=x, v, z={0}; |
114 | | x = vshlq_n_u64(x, N); |
115 | | u = vshrq_n_u64(u, (64-N)); |
116 | | v = vcombine_u64(vget_low_u64(z), vget_low_u64(u)); |
117 | | x = vorrq_u64(x, v); |
118 | | return x; |
119 | | } |
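// ShiftLeft128_ARMv8<N> shifts the 128-bit value x left by N bits (N < 64) as
// a whole: both lanes are shifted with vshlq_n_u64, the bits that fall off the
// top of the low lane are recovered with vshrq_n_u64 by 64-N, and the
// vcombine step ORs them into the bottom of the high lane.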
120 | | |
121 | | // c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the |
122 | | // Intel paper or https://github.com/antonblanchard/crc32-vpmsum for background. |
123 | | inline void |
124 | | GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0) |
125 | | { |
126 | | const unsigned int mask[4] = { |
127 | | 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff, |
128 | | }; |
129 | | |
130 | | uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0}; |
131 | | m0 = vreinterpretq_u64_u32(vld1q_u32(mask)); |
132 | | b1 = c1; a1 = c1; |
133 | | a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0)); |
134 | | a1 = vshlq_n_u64(a1, 23); |
135 | | a1 = vshrq_n_u64(a1, 23); |
136 | | c1 = vorrq_u64(a1, a0); |
137 | | b2 = vshrq_n_u64(c2, (64-23)); |
138 | | c3 = ShiftLeft128_ARMv8<23>(c3); |
139 | | a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); |
140 | | c3 = vorrq_u64(c3, a0); |
141 | | b1 = vshrq_n_u64(b1, (64-23)); |
142 | | c2 = ShiftLeft128_ARMv8<23>(c2); |
143 | | a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0)); |
144 | | c2 = vorrq_u64(c2, a0); |
145 | | b3 = c3; |
146 | | b2 = vshrq_n_u64(c2, (64-10)); |
147 | | b3 = ShiftLeft128_ARMv8<10>(b3); |
148 | | a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); |
149 | | b3 = vorrq_u64(b3, a0); |
150 | | a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0)); |
151 | | b3 = veorq_u64(b3, a0); |
152 | | b1 = vshrq_n_u64(b3, (64-23)); |
153 | | b3 = ShiftLeft128_ARMv8<23>(b3); |
154 | | b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0)); |
155 | | b3 = vorrq_u64(b3, b1); |
156 | | c2 = veorq_u64(c2, b3); |
157 | | b3 = c3; |
158 | | b2 = vshrq_n_u64(c2, (64-10)); |
159 | | b3 = ShiftLeft128_ARMv8<10>(b3); |
160 | | b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); |
161 | | b3 = vorrq_u64(b3, b2); |
162 | | b2 = c2; |
163 | | b2 = ShiftLeft128_ARMv8<10>(b2); |
164 | | a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2)); |
165 | | c2 = veorq_u64(c2, a0); |
166 | | a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3)); |
167 | | a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); |
168 | | a0 = vorrq_u64(a0, a1); |
169 | | c3 = veorq_u64(c3, a0); |
170 | | c0 = veorq_u64(c0, c2); |
171 | | c1 = veorq_u64(c1, c3); |
172 | | c1 = vandq_u64(c1, m0); |
173 | | } |
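// A note on the constants above, assuming the field being reduced is
// GF(2^233) with the trinomial p(x) = x^233 + x^74 + 1, as the GF2NT_233
// name suggests: an element occupies 233 bits of the 256-bit c1:c0 pair, so
// the final mask keeps bits 0..232 (three full 32-bit words plus 9 bits,
// hence 0x000001ff), and the shift counts come from the same parameters:
// 23 = 256 - 233 and 10 = 74 - 64.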
174 | | |
175 | | #endif |
176 | | |
177 | | // ************************** SSE ************************** // |
178 | | |
179 | | #if (CRYPTOPP_CLMUL_AVAILABLE) |
180 | | |
181 | | using CryptoPP::word; |
182 | | |
183 | | // c1c0 = a * b |
184 | | inline void |
185 | | F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b) |
186 | 0 | { |
187 | 0 | __m128i t1, t2; |
188 | |
189 | 0 | c0 = _mm_clmulepi64_si128(a, b, 0x00); |
190 | 0 | c1 = _mm_clmulepi64_si128(a, b, 0x11); |
191 | 0 | t1 = _mm_shuffle_epi32(a, 0xEE); |
192 | 0 | t1 = _mm_xor_si128(a, t1); |
193 | 0 | t2 = _mm_shuffle_epi32(b, 0xEE); |
194 | 0 | t2 = _mm_xor_si128(b, t2); |
195 | 0 | t1 = _mm_clmulepi64_si128(t1, t2, 0x00); |
196 | 0 | t1 = _mm_xor_si128(c0, t1); |
197 | 0 | t1 = _mm_xor_si128(c1, t1); |
198 | 0 | t2 = t1; |
199 | 0 | t1 = _mm_slli_si128(t1, 8); |
200 | 0 | t2 = _mm_srli_si128(t2, 8); |
201 | 0 | c0 = _mm_xor_si128(c0, t1); |
202 | 0 | c1 = _mm_xor_si128(c1, t2); |
203 | 0 | } |
204 | | |
205 | | // c3c2c1c0 = a1a0 * b1b0 |
206 | | inline void |
207 | | F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0, |
208 | | const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0) |
209 | 0 | { |
210 | 0 | __m128i c4, c5; |
211 | 0 | __m128i x0=a0, x1=a1, y0=b0, y1=b1; |
212 | |
213 | 0 | F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0); |
214 | 0 | F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1); |
215 | |
216 | 0 | x0 = _mm_xor_si128(x0, x1); |
217 | 0 | y0 = _mm_xor_si128(y0, y1); |
218 | |
219 | 0 | F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0); |
220 | |
221 | 0 | c4 = _mm_xor_si128(c4, c0); |
222 | 0 | c4 = _mm_xor_si128(c4, c2); |
223 | 0 | c5 = _mm_xor_si128(c5, c1); |
224 | 0 | c5 = _mm_xor_si128(c5, c3); |
225 | 0 | c1 = _mm_xor_si128(c1, c4); |
226 | 0 | c2 = _mm_xor_si128(c2, c5); |
227 | 0 | } |
228 | | |
229 | | // c3c2c1c0 = a1a0 * a1a0 |
230 | | inline void |
231 | | F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, |
232 | | __m128i& c0, const __m128i& a1, const __m128i& a0) |
233 | 0 | { |
234 | 0 | c0 = _mm_clmulepi64_si128(a0, a0, 0x00); |
235 | 0 | c1 = _mm_clmulepi64_si128(a0, a0, 0x11); |
236 | 0 | c2 = _mm_clmulepi64_si128(a1, a1, 0x00); |
237 | 0 | c3 = _mm_clmulepi64_si128(a1, a1, 0x11); |
238 | 0 | } |
239 | | |
240 | | // x = (x << n), z = 0 |
241 | | template <unsigned int N> |
242 | | inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z) |
243 | 0 | { |
244 | 0 | __m128i u=x, v; |
245 | 0 | x = _mm_slli_epi64(x, N); |
246 | 0 | u = _mm_srli_epi64(u, (64-N)); |
247 | 0 | v = _mm_unpacklo_epi64(z, u); |
248 | 0 | x = _mm_or_si128(x, v); |
249 | 0 | return x; |
250 | 0 | } |
251 | | |
252 | | // c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the |
253 | | // Intel paper or https://github.com/antonblanchard/crc32-vpmsum for background. |
254 | | inline void |
255 | | GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0) |
256 | 0 | { |
257 | 0 | const unsigned int m[4] = { |
258 | 0 | 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff |
259 | 0 | }; |
260 | |
261 | 0 | __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0; |
262 | 0 | m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]); |
263 | 0 | z0 = _mm_setzero_si128(); |
264 | 0 | b1 = c1; a1 = c1; |
265 | 0 | a0 = _mm_move_epi64(c1); |
266 | 0 | a1 = _mm_slli_epi64(a1, 23); |
267 | 0 | a1 = _mm_srli_epi64(a1, 23); |
268 | 0 | c1 = _mm_or_si128(a1, a0); |
269 | 0 | b2 = _mm_srli_epi64(c2, (64-23)); |
270 | 0 | c3 = ShiftLeft128_SSE<23>(c3, z0); |
271 | 0 | a0 = _mm_unpackhi_epi64(b2, z0); |
272 | 0 | c3 = _mm_or_si128(c3, a0); |
273 | 0 | b1 = _mm_srli_epi64(b1, (64-23)); |
274 | 0 | c2 = ShiftLeft128_SSE<23>(c2, z0); |
275 | 0 | a0 = _mm_unpackhi_epi64(b1, z0); |
276 | 0 | c2 = _mm_or_si128(c2, a0); |
277 | 0 | b3 = c3; |
278 | 0 | b2 = _mm_srli_epi64(c2, (64-10)); |
279 | 0 | b3 = ShiftLeft128_SSE<10>(b3, z0); |
280 | 0 | a0 = _mm_unpackhi_epi64(b2, z0); |
281 | 0 | b3 = _mm_or_si128(b3, a0); |
282 | 0 | a0 = _mm_unpackhi_epi64(c3, z0); |
283 | 0 | b3 = _mm_xor_si128(b3, a0); |
284 | 0 | b1 = _mm_srli_epi64(b3, (64-23)); |
285 | 0 | b3 = ShiftLeft128_SSE<23>(b3, z0); |
286 | 0 | b3 = _mm_unpackhi_epi64(b3, z0); |
287 | 0 | b3 = _mm_or_si128(b3, b1); |
288 | 0 | c2 = _mm_xor_si128(c2, b3); |
289 | 0 | b3 = c3; |
290 | 0 | b2 = _mm_srli_epi64(c2, (64-10)); |
291 | 0 | b3 = ShiftLeft128_SSE<10>(b3, z0); |
292 | 0 | b2 = _mm_unpackhi_epi64(b2, z0); |
293 | 0 | b3 = _mm_or_si128(b3, b2); |
294 | 0 | b2 = c2; |
295 | 0 | b2 = ShiftLeft128_SSE<10>(b2, z0); |
296 | 0 | a0 = _mm_unpacklo_epi64(z0, b2); |
297 | 0 | c2 = _mm_xor_si128(c2, a0); |
298 | 0 | a0 = _mm_unpacklo_epi64(z0, b3); |
299 | 0 | a1 = _mm_unpackhi_epi64(b2, z0); |
300 | 0 | a0 = _mm_or_si128(a0, a1); |
301 | 0 | c3 = _mm_xor_si128(c3, a0); |
302 | 0 | c0 = _mm_xor_si128(c0, c2); |
303 | 0 | c1 = _mm_xor_si128(c1, c3); |
304 | 0 | c1 = _mm_and_si128(c1, m0); |
305 | 0 | } |
306 | | |
307 | | #endif |
308 | | |
309 | | // ************************* Power8 ************************* // |
310 | | |
311 | | #if (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0 |
312 | | |
313 | | using CryptoPP::byte; |
314 | | using CryptoPP::word; |
315 | | using CryptoPP::uint8x16_p; |
316 | | using CryptoPP::uint64x2_p; |
317 | | |
318 | | using CryptoPP::VecLoad; |
319 | | using CryptoPP::VecStore; |
320 | | |
321 | | using CryptoPP::VecOr; |
322 | | using CryptoPP::VecXor; |
323 | | using CryptoPP::VecAnd; |
324 | | |
325 | | using CryptoPP::VecPermute; |
326 | | using CryptoPP::VecMergeLow; |
327 | | using CryptoPP::VecMergeHigh; |
328 | | using CryptoPP::VecShiftLeft; |
329 | | using CryptoPP::VecShiftRight; |
330 | | |
331 | | using CryptoPP::VecIntelMultiply00; |
332 | | using CryptoPP::VecIntelMultiply11; |
333 | | |
334 | | // c1c0 = a * b |
335 | | inline void |
336 | | F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b) |
337 | | { |
338 | | uint64x2_p t1, t2; |
339 | | const uint64x2_p z0={0}; |
340 | | |
341 | | c0 = VecIntelMultiply00(a, b); |
342 | | c1 = VecIntelMultiply11(a, b); |
343 | | t1 = VecMergeLow(a, a); |
344 | | t1 = VecXor(a, t1); |
345 | | t2 = VecMergeLow(b, b); |
346 | | t2 = VecXor(b, t2); |
347 | | t1 = VecIntelMultiply00(t1, t2); |
348 | | t1 = VecXor(c0, t1); |
349 | | t1 = VecXor(c1, t1); |
350 | | t2 = t1; |
351 | | t1 = VecMergeHigh(z0, t1); |
352 | | t2 = VecMergeLow(t2, z0); |
353 | | c0 = VecXor(c0, t1); |
354 | | c1 = VecXor(c1, t2); |
355 | | } |
356 | | |
357 | | // c3c2c1c0 = a1a0 * b1b0 |
358 | | inline void |
359 | | F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0, |
360 | | const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0) |
361 | | { |
362 | | uint64x2_p c4, c5; |
363 | | uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1; |
364 | | |
365 | | F2N_Multiply_128x128_POWER8(c1, c0, x0, y0); |
366 | | F2N_Multiply_128x128_POWER8(c3, c2, x1, y1); |
367 | | |
368 | | x0 = VecXor(x0, x1); |
369 | | y0 = VecXor(y0, y1); |
370 | | |
371 | | F2N_Multiply_128x128_POWER8(c5, c4, x0, y0); |
372 | | |
373 | | c4 = VecXor(c4, c0); |
374 | | c4 = VecXor(c4, c2); |
375 | | c5 = VecXor(c5, c1); |
376 | | c5 = VecXor(c5, c3); |
377 | | c1 = VecXor(c1, c4); |
378 | | c2 = VecXor(c2, c5); |
379 | | } |
380 | | |
381 | | // c3c2c1c0 = a1a0 * a1a0 |
382 | | inline void |
383 | | F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, |
384 | | uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0) |
385 | | { |
386 | | c0 = VecIntelMultiply00(a0, a0); |
387 | | c1 = VecIntelMultiply11(a0, a0); |
388 | | c2 = VecIntelMultiply00(a1, a1); |
389 | | c3 = VecIntelMultiply11(a1, a1); |
390 | | } |
391 | | |
392 | | // x = (x << n), z = 0 |
393 | | template <unsigned int N> |
394 | | inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x) |
395 | | { |
396 | | uint64x2_p u=x, v; |
397 | | const uint64x2_p z={0}; |
398 | | |
399 | | x = VecShiftLeft<N>(x); |
400 | | u = VecShiftRight<64-N>(u); |
401 | | v = VecMergeHigh(z, u); |
402 | | x = VecOr(x, v); |
403 | | return x; |
404 | | } |
405 | | |
406 | | // c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the |
407 | | // Intel paper or https://github.com/antonblanchard/crc32-vpmsum for background. |
408 | | inline void |
409 | | GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0) |
410 | | { |
411 | | const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)}; |
412 | | const uint64x2_p m0 = (uint64x2_p)VecLoad(mod); |
413 | | |
414 | | uint64x2_p b3, b2, b1, /*b0,*/ a1, a0; |
415 | | const uint64x2_p z0={0}; |
416 | | |
417 | | b1 = c1; a1 = c1; |
418 | | a0 = VecMergeHigh(c1, z0); |
419 | | a1 = VecShiftLeft<23>(a1); |
420 | | a1 = VecShiftRight<23>(a1); |
421 | | c1 = VecOr(a1, a0); |
422 | | b2 = VecShiftRight<64-23>(c2); |
423 | | c3 = ShiftLeft128_POWER8<23>(c3); |
424 | | a0 = VecMergeLow(b2, z0); |
425 | | c3 = VecOr(c3, a0); |
426 | | b1 = VecShiftRight<64-23>(b1); |
427 | | c2 = ShiftLeft128_POWER8<23>(c2); |
428 | | a0 = VecMergeLow(b1, z0); |
429 | | c2 = VecOr(c2, a0); |
430 | | b3 = c3; |
431 | | b2 = VecShiftRight<64-10>(c2); |
432 | | b3 = ShiftLeft128_POWER8<10>(b3); |
433 | | a0 = VecMergeLow(b2, z0); |
434 | | b3 = VecOr(b3, a0); |
435 | | a0 = VecMergeLow(c3, z0); |
436 | | b3 = VecXor(b3, a0); |
437 | | b1 = VecShiftRight<64-23>(b3); |
438 | | b3 = ShiftLeft128_POWER8<23>(b3); |
439 | | b3 = VecMergeLow(b3, z0); |
440 | | b3 = VecOr(b3, b1); |
441 | | c2 = VecXor(c2, b3); |
442 | | b3 = c3; |
443 | | b2 = VecShiftRight<64-10>(c2); |
444 | | b3 = ShiftLeft128_POWER8<10>(b3); |
445 | | b2 = VecMergeLow(b2, z0); |
446 | | b3 = VecOr(b3, b2); |
447 | | b2 = c2; |
448 | | b2 = ShiftLeft128_POWER8<10>(b2); |
449 | | a0 = VecMergeHigh(z0, b2); |
450 | | c2 = VecXor(c2, a0); |
451 | | a0 = VecMergeHigh(z0, b3); |
452 | | a1 = VecMergeLow(b2, z0); |
453 | | a0 = VecOr(a0, a1); |
454 | | c3 = VecXor(c3, a0); |
455 | | c0 = VecXor(c0, c2); |
456 | | c1 = VecXor(c1, c3); |
457 | | c1 = VecAnd(c1, m0); |
458 | | } |
459 | | |
460 | | #endif |
461 | | |
462 | | ANONYMOUS_NAMESPACE_END |
463 | | |
464 | | NAMESPACE_BEGIN(CryptoPP) |
465 | | |
466 | | #if (CRYPTOPP_CLMUL_AVAILABLE) |
467 | | |
468 | | void |
469 | | GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC) |
470 | 0 | { |
471 | 0 | enum {S=sizeof(__m128i)/sizeof(word)}; |
472 | 0 | __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S)); |
473 | 0 | __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S)); |
474 | 0 | __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+0*S)); |
475 | 0 | __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+1*S)); |
476 | |
477 | 0 | __m128i c0, c1, c2, c3; |
478 | 0 | F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0); |
479 | 0 | GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0); |
480 | |
481 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0); |
482 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1); |
483 | 0 | } |
484 | | |
485 | | void |
486 | | GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC) |
487 | 0 | { |
488 | 0 | enum {S=sizeof(__m128i)/sizeof(word)}; |
489 | 0 | __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S)); |
490 | 0 | __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S)); |
491 | |
492 | 0 | __m128i c0, c1, c2, c3; |
493 | 0 | F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0); |
494 | 0 | GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0); |
495 | |
496 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0); |
497 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1); |
498 | 0 | } |
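// A hypothetical usage sketch (not taken from gf2n.cpp; the buffer setup and
// the HasCLMUL() guard are illustrative only). Each operand is one field
// element stored in a 256-bit run of words -- four words on a 64-bit build,
// eight on a 32-bit build -- least significant word first:
//
//   CryptoPP::word a[32/sizeof(CryptoPP::word)] = { /* element A */ };
//   CryptoPP::word b[32/sizeof(CryptoPP::word)] = { /* element B */ };
//   CryptoPP::word c[32/sizeof(CryptoPP::word)];
//   if (CryptoPP::HasCLMUL())
//       CryptoPP::GF2NT_233_Multiply_Reduce_CLMUL(a, b, c);  // c = a*b mod p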
499 | | |
500 | | #elif (CRYPTOPP_ARM_PMULL_AVAILABLE) |
501 | | |
502 | | void |
503 | | GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC) |
504 | | { |
505 | | // word is either 32-bit or 64-bit, depending on the platform. |
506 | | // Load using a 32-bit pointer to avoid possible alignment issues. |
507 | | const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA); |
508 | | const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB); |
509 | | |
510 | | uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0)); |
511 | | uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4)); |
512 | | uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0)); |
513 | | uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4)); |
514 | | |
515 | | uint64x2_t c0, c1, c2, c3; |
516 | | F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0); |
517 | | GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0); |
518 | | |
519 | | uint32_t* pCC = reinterpret_cast<uint32_t*>(pC); |
520 | | vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0)); |
521 | | vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1)); |
522 | | } |
523 | | |
524 | | void |
525 | | GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC) |
526 | | { |
527 | | // word is either 32-bit or 64-bit, depending on the platform. |
528 | | // Load using a 32-bit pointer to avoid possible alignment issues. |
529 | | const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA); |
530 | | uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0)); |
531 | | uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4)); |
532 | | |
533 | | uint64x2_t c0, c1, c2, c3; |
534 | | F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0); |
535 | | GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0); |
536 | | |
537 | | uint32_t* pCC = reinterpret_cast<uint32_t*>(pC); |
538 | | vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0)); |
539 | | vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1)); |
540 | | } |
541 | | |
542 | | #elif (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0 |
543 | | |
544 | | void |
545 | | GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC) |
546 | | { |
547 | | // word is either 32-bit or 64-bit, depending on the platform. |
548 | | // Load using a byte pointer to avoid possible alignment issues. |
549 | | const byte* pAA = reinterpret_cast<const byte*>(pA); |
550 | | const byte* pBB = reinterpret_cast<const byte*>(pB); |
551 | | |
552 | | uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0); |
553 | | uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16); |
554 | | uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0); |
555 | | uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16); |
556 | | |
557 | | #if (CRYPTOPP_BIG_ENDIAN) |
558 | | const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; |
559 | | const uint8x16_p m = (uint8x16_p)VecLoad(mb); |
560 | | a0 = VecPermute(a0, m); |
561 | | a1 = VecPermute(a1, m); |
562 | | b0 = VecPermute(b0, m); |
563 | | b1 = VecPermute(b1, m); |
564 | | #endif |
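// The permute above swaps the two 32-bit words inside each 64-bit lane,
// presumably to put the limbs of each operand into the order the 64-bit
// carry-less multiplies expect on big-endian targets. The permute is its own
// inverse, so the same mask is applied to c0 and c1 before the store.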
565 | | |
566 | | uint64x2_p c0, c1, c2, c3; |
567 | | F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0); |
568 | | GF2NT_233_Reduce_POWER8(c3, c2, c1, c0); |
569 | | |
570 | | #if (CRYPTOPP_BIG_ENDIAN) |
571 | | c0 = VecPermute(c0, m); |
572 | | c1 = VecPermute(c1, m); |
573 | | #endif |
574 | | |
575 | | byte* pCC = reinterpret_cast<byte*>(pC); |
576 | | VecStore(c0, pCC+0); |
577 | | VecStore(c1, pCC+16); |
578 | | } |
579 | | |
580 | | void |
581 | | GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC) |
582 | | { |
583 | | // word is either 32-bit or 64-bit, depending on the platform. |
584 | | // Load using a byte pointer to avoid possible alignment issues. |
585 | | const byte* pAA = reinterpret_cast<const byte*>(pA); |
586 | | uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0); |
587 | | uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16); |
588 | | |
589 | | #if (CRYPTOPP_BIG_ENDIAN) |
590 | | const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; |
591 | | const uint8x16_p m = (uint8x16_p)VecLoad(mb); |
592 | | a0 = VecPermute(a0, m); |
593 | | a1 = VecPermute(a1, m); |
594 | | #endif |
595 | | |
596 | | uint64x2_p c0, c1, c2, c3; |
597 | | F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0); |
598 | | GF2NT_233_Reduce_POWER8(c3, c2, c1, c0); |
599 | | |
600 | | #if (CRYPTOPP_BIG_ENDIAN) |
601 | | c0 = VecPermute(c0, m); |
602 | | c1 = VecPermute(c1, m); |
603 | | #endif |
604 | | |
605 | | byte* pCC = reinterpret_cast<byte*>(pC); |
606 | | VecStore(c0, pCC+0); |
607 | | VecStore(c1, pCC+16); |
608 | | } |
609 | | |
610 | | #endif |
611 | | |
612 | | NAMESPACE_END |
613 | | |
614 | | #endif // CRYPTOPP_IMPORTS |