Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/gf2n_simd.cpp
 Line|Count|Source
    1|     |// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton
    2|     |//                 Also based on PCLMULQDQ code by Jankowski, Laurent and
    3|     |//                 O'Mahony from Intel (see reference below).
    4|     |//
    5|     |//    This source file uses intrinsics and built-ins to gain access to
    6|     |//    CLMUL, ARMv8a, and Power8 instructions. A separate source file is
    7|     |//    needed because additional CXXFLAGS are required to enable the
    8|     |//    appropriate instruction sets in some build configurations.
    9|     |//
   10|     |//    Several speedups were taken from Intel Polynomial Multiplication
   11|     |//    Instruction and its Usage for Elliptic Curve Cryptography, by
   12|     |//    Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
   13|     |//    https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
   14|     |//    There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
   15|     |//    The IACR paper performs some optimizations that the compiler is
   16|     |//    expected to perform, like Common Subexpression Elimination to save
   17|     |//    on variables (among others). Note that the compiler may miss the
   18|     |//    optimization, so the IACR paper is useful. However, the code is GPL3
   19|     |//    and toxic for some users of the library, so it is not used here...
   20|     |
   21|     |#include "pch.h"
   22|     |#include "config.h"
   23|     |
   24|     |#ifndef CRYPTOPP_IMPORTS
   25|     |
   26|     |#include "gf2n.h"
   27|     |
   28|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
   29|     |# include <emmintrin.h>
   30|     |# include <wmmintrin.h>
   31|     |#endif
   32|     |
   33|     |#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
   34|     |# include "arm_simd.h"
   35|     |#endif
   36|     |
   37|     |#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
   38|     |# include "ppc_simd.h"
   39|     |#endif
   40|     |
   41|     |// Squash MS LNK4221 and libtool warnings
   42|     |extern const char GF2N_SIMD_FNAME[] = __FILE__;
   43|     |
   44|     |ANONYMOUS_NAMESPACE_BEGIN
   45|     |
   46|     |// ************************** ARMv8 ************************** //
   47|     |
   48|     |using CryptoPP::word;
   49|     |
   50|     |#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
   51|     |
   52|     |// c1c0 = a * b
   53|     |inline void
   54|     |F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
   55|     |{
   56|     |    uint64x2_t t1, t2, z0={0};
   57|     |
   58|     |    c0 = PMULL_00(a, b);
   59|     |    c1 = PMULL_11(a, b);
   60|     |    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
   61|     |    t1 = veorq_u64(a, t1);
   62|     |    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
   63|     |    t2 = veorq_u64(b, t2);
   64|     |    t1 = PMULL_00(t1, t2);
   65|     |    t1 = veorq_u64(c0, t1);
   66|     |    t1 = veorq_u64(c1, t1);
   67|     |    t2 = t1;
   68|     |    t1 = vextq_u64(z0, t1, 1);
   69|     |    t2 = vextq_u64(t2, z0, 1);
   70|     |    c0 = veorq_u64(c0, t1);
   71|     |    c1 = veorq_u64(c1, t2);
   72|     |}
   73|     |
   74|     |// c3c2c1c0 = a1a0 * b1b0
   75|     |inline void
   76|     |F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
   77|     |    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
   78|     |{
   79|     |    uint64x2_t c4, c5;
   80|     |    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;
   81|     |
   82|     |    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
   83|     |    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);
   84|     |
   85|     |    x0 = veorq_u64(x0, x1);
   86|     |    y0 = veorq_u64(y0, y1);
   87|     |
   88|     |    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);
   89|     |
   90|     |    c4 = veorq_u64(c4, c0);
   91|     |    c4 = veorq_u64(c4, c2);
   92|     |    c5 = veorq_u64(c5, c1);
   93|     |    c5 = veorq_u64(c5, c3);
   94|     |    c1 = veorq_u64(c1, c4);
   95|     |    c2 = veorq_u64(c2, c5);
   96|     |}
   97|     |
   98|     |// c3c2c1c0 = a1a0 * a1a0
   99|     |inline void
  100|     |F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
  101|     |    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
  102|     |{
  103|     |    c0 = PMULL_00(a0, a0);
  104|     |    c1 = PMULL_11(a0, a0);
  105|     |    c2 = PMULL_00(a1, a1);
  106|     |    c3 = PMULL_11(a1, a1);
  107|     |}
  108|     |
  109|     |// x = (x << n), z = 0
  110|     |template <unsigned int N>
  111|     |inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
  112|     |{
  113|     |    uint64x2_t u=x, v, z={0};
  114|     |    x = vshlq_n_u64(x, N);
  115|     |    u = vshrq_n_u64(u, (64-N));
  116|     |    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
  117|     |    x = vorrq_u64(x, v);
  118|     |    return x;
  119|     |}
  120|     |
  121|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  122|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  123|     |inline void
  124|     |GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
  125|     |{
  126|     |    const unsigned int mask[4] = {
  127|     |        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
  128|     |    };
  129|     |
  130|     |    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
  131|     |    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
  132|     |    b1 = c1; a1 = c1;
  133|     |    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
  134|     |    a1 = vshlq_n_u64(a1, 23);
  135|     |    a1 = vshrq_n_u64(a1, 23);
  136|     |    c1 = vorrq_u64(a1, a0);
  137|     |    b2 = vshrq_n_u64(c2, (64-23));
  138|     |    c3 = ShiftLeft128_ARMv8<23>(c3);
  139|     |    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  140|     |    c3 = vorrq_u64(c3, a0);
  141|     |    b1 = vshrq_n_u64(b1, (64-23));
  142|     |    c2 = ShiftLeft128_ARMv8<23>(c2);
  143|     |    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
  144|     |    c2 = vorrq_u64(c2, a0);
  145|     |    b3 = c3;
  146|     |    b2 = vshrq_n_u64(c2, (64-10));
  147|     |    b3 = ShiftLeft128_ARMv8<10>(b3);
  148|     |    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  149|     |    b3 = vorrq_u64(b3, a0);
  150|     |    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
  151|     |    b3 = veorq_u64(b3, a0);
  152|     |    b1 = vshrq_n_u64(b3, (64-23));
  153|     |    b3 = ShiftLeft128_ARMv8<23>(b3);
  154|     |    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
  155|     |    b3 = vorrq_u64(b3, b1);
  156|     |    c2 = veorq_u64(c2, b3);
  157|     |    b3 = c3;
  158|     |    b2 = vshrq_n_u64(c2, (64-10));
  159|     |    b3 = ShiftLeft128_ARMv8<10>(b3);
  160|     |    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  161|     |    b3 = vorrq_u64(b3, b2);
  162|     |    b2 = c2;
  163|     |    b2 = ShiftLeft128_ARMv8<10>(b2);
  164|     |    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
  165|     |    c2 = veorq_u64(c2, a0);
  166|     |    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
  167|     |    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  168|     |    a0 = vorrq_u64(a0, a1);
  169|     |    c3 = veorq_u64(c3, a0);
  170|     |    c0 = veorq_u64(c0, c2);
  171|     |    c1 = veorq_u64(c1, c3);
  172|     |    c1 = vandq_u64(c1, m0);
  173|     |}
  174|     |
  175|     |#endif
  176|     |
  177|     |// ************************** SSE ************************** //
  178|     |
  179|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
  180|     |
  181|     |using CryptoPP::word;
  182|     |
  183|     |// c1c0 = a * b
  184|     |inline void
  185|     |F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
  186|    0|{
  187|    0|    __m128i t1, t2;
  188|     |
  189|    0|    c0 = _mm_clmulepi64_si128(a, b, 0x00);
  190|    0|    c1 = _mm_clmulepi64_si128(a, b, 0x11);
  191|    0|    t1 = _mm_shuffle_epi32(a, 0xEE);
  192|    0|    t1 = _mm_xor_si128(a, t1);
  193|    0|    t2 = _mm_shuffle_epi32(b, 0xEE);
  194|    0|    t2 = _mm_xor_si128(b, t2);
  195|    0|    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
  196|    0|    t1 = _mm_xor_si128(c0, t1);
  197|    0|    t1 = _mm_xor_si128(c1, t1);
  198|    0|    t2 = t1;
  199|    0|    t1 = _mm_slli_si128(t1, 8);
  200|    0|    t2 = _mm_srli_si128(t2, 8);
  201|    0|    c0 = _mm_xor_si128(c0, t1);
  202|    0|    c1 = _mm_xor_si128(c1, t2);
  203|    0|}
  204|     |
  205|     |// c3c2c1c0 = a1a0 * b1b0
  206|     |inline void
  207|     |F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
  208|     |    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
  209|    0|{
  210|    0|    __m128i c4, c5;
  211|    0|    __m128i x0=a0, x1=a1, y0=b0, y1=b1;
  212|     |
  213|    0|    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
  214|    0|    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);
  215|     |
  216|    0|    x0 = _mm_xor_si128(x0, x1);
  217|    0|    y0 = _mm_xor_si128(y0, y1);
  218|     |
  219|    0|    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);
  220|     |
  221|    0|    c4 = _mm_xor_si128(c4, c0);
  222|    0|    c4 = _mm_xor_si128(c4, c2);
  223|    0|    c5 = _mm_xor_si128(c5, c1);
  224|    0|    c5 = _mm_xor_si128(c5, c3);
  225|    0|    c1 = _mm_xor_si128(c1, c4);
  226|    0|    c2 = _mm_xor_si128(c2, c5);
  227|    0|}
  228|     |
  229|     |// c3c2c1c0 = a1a0 * a1a0
  230|     |inline void
  231|     |F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
  232|     |    __m128i& c0, const __m128i& a1, const __m128i& a0)
  233|    0|{
  234|    0|    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
  235|    0|    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
  236|    0|    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
  237|    0|    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
  238|    0|}
  239|     |
  240|     |// x = (x << n), z = 0
  241|     |template <unsigned int N>
  242|     |inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
  243|    0|{
  244|    0|    __m128i u=x, v;
  245|    0|    x = _mm_slli_epi64(x, N);
  246|    0|    u = _mm_srli_epi64(u, (64-N));
  247|    0|    v = _mm_unpacklo_epi64(z, u);
  248|    0|    x = _mm_or_si128(x, v);
  249|    0|    return x;
  250|    0|}
     |     |  Unexecuted instantiation: gf2n_simd.cpp:long long __vector(2) (anonymous namespace)::ShiftLeft128_SSE<23u>(long long __vector(2), long long __vector(2) const&)
     |     |  Unexecuted instantiation: gf2n_simd.cpp:long long __vector(2) (anonymous namespace)::ShiftLeft128_SSE<10u>(long long __vector(2), long long __vector(2) const&)
  251|     |
  252|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  253|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  254|     |inline void
  255|     |GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
  256|    0|{
  257|    0|    const unsigned int m[4] = {
  258|    0|        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
  259|    0|    };
  260|     |
  261|    0|    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
  262|    0|    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
  263|    0|    z0 = _mm_setzero_si128();
  264|    0|    b1 = c1; a1 = c1;
  265|    0|    a0 = _mm_move_epi64(c1);
  266|    0|    a1 = _mm_slli_epi64(a1, 23);
  267|    0|    a1 = _mm_srli_epi64(a1, 23);
  268|    0|    c1 = _mm_or_si128(a1, a0);
  269|    0|    b2 = _mm_srli_epi64(c2, (64-23));
  270|    0|    c3 = ShiftLeft128_SSE<23>(c3, z0);
  271|    0|    a0 = _mm_unpackhi_epi64(b2, z0);
  272|    0|    c3 = _mm_or_si128(c3, a0);
  273|    0|    b1 = _mm_srli_epi64(b1, (64-23));
  274|    0|    c2 = ShiftLeft128_SSE<23>(c2, z0);
  275|    0|    a0 = _mm_unpackhi_epi64(b1, z0);
  276|    0|    c2 = _mm_or_si128(c2, a0);
  277|    0|    b3 = c3;
  278|    0|    b2 = _mm_srli_epi64(c2, (64-10));
  279|    0|    b3 = ShiftLeft128_SSE<10>(b3, z0);
  280|    0|    a0 = _mm_unpackhi_epi64(b2, z0);
  281|    0|    b3 = _mm_or_si128(b3, a0);
  282|    0|    a0 = _mm_unpackhi_epi64(c3, z0);
  283|    0|    b3 = _mm_xor_si128(b3, a0);
  284|    0|    b1 = _mm_srli_epi64(b3, (64-23));
  285|    0|    b3 = ShiftLeft128_SSE<23>(b3, z0);
  286|    0|    b3 = _mm_unpackhi_epi64(b3, z0);
  287|    0|    b3 = _mm_or_si128(b3, b1);
  288|    0|    c2 = _mm_xor_si128(c2, b3);
  289|    0|    b3 = c3;
  290|    0|    b2 = _mm_srli_epi64(c2, (64-10));
  291|    0|    b3 = ShiftLeft128_SSE<10>(b3, z0);
  292|    0|    b2 = _mm_unpackhi_epi64(b2, z0);
  293|    0|    b3 = _mm_or_si128(b3, b2);
  294|    0|    b2 = c2;
  295|    0|    b2 = ShiftLeft128_SSE<10>(b2, z0);
  296|    0|    a0 = _mm_unpacklo_epi64(z0, b2);
  297|    0|    c2 = _mm_xor_si128(c2, a0);
  298|    0|    a0 = _mm_unpacklo_epi64(z0, b3);
  299|    0|    a1 = _mm_unpackhi_epi64(b2, z0);
  300|    0|    a0 = _mm_or_si128(a0, a1);
  301|    0|    c3 = _mm_xor_si128(c3, a0);
  302|    0|    c0 = _mm_xor_si128(c0, c2);
  303|    0|    c1 = _mm_xor_si128(c1, c3);
  304|    0|    c1 = _mm_and_si128(c1, m0);
  305|    0|}
  306|     |
  307|     |#endif
  308|     |
  309|     |// ************************* Power8 ************************* //
  310|     |
  311|     |#if (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0
  312|     |
  313|     |using CryptoPP::byte;
  314|     |using CryptoPP::word;
  315|     |using CryptoPP::uint8x16_p;
  316|     |using CryptoPP::uint64x2_p;
  317|     |
  318|     |using CryptoPP::VecLoad;
  319|     |using CryptoPP::VecStore;
  320|     |
  321|     |using CryptoPP::VecOr;
  322|     |using CryptoPP::VecXor;
  323|     |using CryptoPP::VecAnd;
  324|     |
  325|     |using CryptoPP::VecPermute;
  326|     |using CryptoPP::VecMergeLow;
  327|     |using CryptoPP::VecMergeHigh;
  328|     |using CryptoPP::VecShiftLeft;
  329|     |using CryptoPP::VecShiftRight;
  330|     |
  331|     |using CryptoPP::VecIntelMultiply00;
  332|     |using CryptoPP::VecIntelMultiply11;
  333|     |
  334|     |// c1c0 = a * b
  335|     |inline void
  336|     |F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
  337|     |{
  338|     |    uint64x2_p t1, t2;
  339|     |    const uint64x2_p z0={0};
  340|     |
  341|     |    c0 = VecIntelMultiply00(a, b);
  342|     |    c1 = VecIntelMultiply11(a, b);
  343|     |    t1 = VecMergeLow(a, a);
  344|     |    t1 = VecXor(a, t1);
  345|     |    t2 = VecMergeLow(b, b);
  346|     |    t2 = VecXor(b, t2);
  347|     |    t1 = VecIntelMultiply00(t1, t2);
  348|     |    t1 = VecXor(c0, t1);
  349|     |    t1 = VecXor(c1, t1);
  350|     |    t2 = t1;
  351|     |    t1 = VecMergeHigh(z0, t1);
  352|     |    t2 = VecMergeLow(t2, z0);
  353|     |    c0 = VecXor(c0, t1);
  354|     |    c1 = VecXor(c1, t2);
  355|     |}
  356|     |
  357|     |// c3c2c1c0 = a1a0 * b1b0
  358|     |inline void
  359|     |F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
  360|     |    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
  361|     |{
  362|     |    uint64x2_p c4, c5;
  363|     |    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;
  364|     |
  365|     |    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
  366|     |    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);
  367|     |
  368|     |    x0 = VecXor(x0, x1);
  369|     |    y0 = VecXor(y0, y1);
  370|     |
  371|     |    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);
  372|     |
  373|     |    c4 = VecXor(c4, c0);
  374|     |    c4 = VecXor(c4, c2);
  375|     |    c5 = VecXor(c5, c1);
  376|     |    c5 = VecXor(c5, c3);
  377|     |    c1 = VecXor(c1, c4);
  378|     |    c2 = VecXor(c2, c5);
  379|     |}
  380|     |
  381|     |// c3c2c1c0 = a1a0 * a1a0
  382|     |inline void
  383|     |F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
  384|     |    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
  385|     |{
  386|     |    c0 = VecIntelMultiply00(a0, a0);
  387|     |    c1 = VecIntelMultiply11(a0, a0);
  388|     |    c2 = VecIntelMultiply00(a1, a1);
  389|     |    c3 = VecIntelMultiply11(a1, a1);
  390|     |}
  391|     |
  392|     |// x = (x << n), z = 0
  393|     |template <unsigned int N>
  394|     |inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
  395|     |{
  396|     |    uint64x2_p u=x, v;
  397|     |    const uint64x2_p z={0};
  398|     |
  399|     |    x = VecShiftLeft<N>(x);
  400|     |    u = VecShiftRight<64-N>(u);
  401|     |    v = VecMergeHigh(z, u);
  402|     |    x = VecOr(x, v);
  403|     |    return x;
  404|     |}
  405|     |
  406|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  407|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  408|     |inline void
  409|     |GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
  410|     |{
  411|     |    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
  412|     |    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);
  413|     |
  414|     |    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
  415|     |    const uint64x2_p z0={0};
  416|     |
  417|     |    b1 = c1; a1 = c1;
  418|     |    a0 = VecMergeHigh(c1, z0);
  419|     |    a1 = VecShiftLeft<23>(a1);
  420|     |    a1 = VecShiftRight<23>(a1);
  421|     |    c1 = VecOr(a1, a0);
  422|     |    b2 = VecShiftRight<64-23>(c2);
  423|     |    c3 = ShiftLeft128_POWER8<23>(c3);
  424|     |    a0 = VecMergeLow(b2, z0);
  425|     |    c3 = VecOr(c3, a0);
  426|     |    b1 = VecShiftRight<64-23>(b1);
  427|     |    c2 = ShiftLeft128_POWER8<23>(c2);
  428|     |    a0 = VecMergeLow(b1, z0);
  429|     |    c2 = VecOr(c2, a0);
  430|     |    b3 = c3;
  431|     |    b2 = VecShiftRight<64-10>(c2);
  432|     |    b3 = ShiftLeft128_POWER8<10>(b3);
  433|     |    a0 = VecMergeLow(b2, z0);
  434|     |    b3 = VecOr(b3, a0);
  435|     |    a0 = VecMergeLow(c3, z0);
  436|     |    b3 = VecXor(b3, a0);
  437|     |    b1 = VecShiftRight<64-23>(b3);
  438|     |    b3 = ShiftLeft128_POWER8<23>(b3);
  439|     |    b3 = VecMergeLow(b3, z0);
  440|     |    b3 = VecOr(b3, b1);
  441|     |    c2 = VecXor(c2, b3);
  442|     |    b3 = c3;
  443|     |    b2 = VecShiftRight<64-10>(c2);
  444|     |    b3 = ShiftLeft128_POWER8<10>(b3);
  445|     |    b2 = VecMergeLow(b2, z0);
  446|     |    b3 = VecOr(b3, b2);
  447|     |    b2 = c2;
  448|     |    b2 = ShiftLeft128_POWER8<10>(b2);
  449|     |    a0 = VecMergeHigh(z0, b2);
  450|     |    c2 = VecXor(c2, a0);
  451|     |    a0 = VecMergeHigh(z0, b3);
  452|     |    a1 = VecMergeLow(b2, z0);
  453|     |    a0 = VecOr(a0, a1);
  454|     |    c3 = VecXor(c3, a0);
  455|     |    c0 = VecXor(c0, c2);
  456|     |    c1 = VecXor(c1, c3);
  457|     |    c1 = VecAnd(c1, m0);
  458|     |}
  459|     |
  460|     |#endif
  461|     |
  462|     |ANONYMOUS_NAMESPACE_END
  463|     |
  464|     |NAMESPACE_BEGIN(CryptoPP)
  465|     |
  466|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
  467|     |
  468|     |void
  469|     |GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
  470|    0|{
  471|    0|    enum {S=sizeof(__m128i)/sizeof(word)};
  472|    0|    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
  473|    0|    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
  474|    0|    __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+0*S));
  475|    0|    __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+1*S));
  476|     |
  477|    0|    __m128i c0, c1, c2, c3;
  478|    0|    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
  479|    0|    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);
  480|     |
  481|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
  482|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
  483|    0|}
  484|     |
  485|     |void
  486|     |GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
  487|    0|{
  488|    0|    enum {S=sizeof(__m128i)/sizeof(word)};
  489|    0|    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
  490|    0|    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
  491|     |
  492|    0|    __m128i c0, c1, c2, c3;
  493|    0|    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
  494|    0|    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);
  495|     |
  496|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
  497|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
  498|    0|}
  499|     |
  500|     |#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)
  501|     |
  502|     |void
  503|     |GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
  504|     |{
  505|     |    // word is either 32-bit or 64-bit, depending on the platform.
  506|     |    // Load using a 32-bit pointer to avoid possible alignment issues.
  507|     |    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
  508|     |    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);
  509|     |
  510|     |    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
  511|     |    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
  512|     |    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
  513|     |    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));
  514|     |
  515|     |    uint64x2_t c0, c1, c2, c3;
  516|     |    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
  517|     |    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);
  518|     |
  519|     |    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
  520|     |    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
  521|     |    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
  522|     |}
  523|     |
  524|     |void
  525|     |GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
  526|     |{
  527|     |    // word is either 32-bit or 64-bit, depending on the platform.
  528|     |    // Load using a 32-bit pointer to avoid possible alignment issues.
  529|     |    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
  530|     |    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
  531|     |    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
  532|     |
  533|     |    uint64x2_t c0, c1, c2, c3;
  534|     |    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
  535|     |    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);
  536|     |
  537|     |    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
  538|     |    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
  539|     |    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
  540|     |}
  541|     |
  542|     |#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0
  543|     |
  544|     |void
  545|     |GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
  546|     |{
  547|     |    // word is either 32-bit or 64-bit, depending on the platform.
  548|     |    // Load using a byte pointer to avoid possible alignment issues.
  549|     |    const byte* pAA = reinterpret_cast<const byte*>(pA);
  550|     |    const byte* pBB = reinterpret_cast<const byte*>(pB);
  551|     |
  552|     |    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
  553|     |    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
  554|     |    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
  555|     |    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);
  556|     |
  557|     |#if (CRYPTOPP_BIG_ENDIAN)
  558|     |    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  559|     |    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
  560|     |    a0 = VecPermute(a0, m);
  561|     |    a1 = VecPermute(a1, m);
  562|     |    b0 = VecPermute(b0, m);
  563|     |    b1 = VecPermute(b1, m);
  564|     |#endif
  565|     |
  566|     |    uint64x2_p c0, c1, c2, c3;
  567|     |    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
  568|     |    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);
  569|     |
  570|     |#if (CRYPTOPP_BIG_ENDIAN)
  571|     |    c0 = VecPermute(c0, m);
  572|     |    c1 = VecPermute(c1, m);
  573|     |#endif
  574|     |
  575|     |    byte* pCC = reinterpret_cast<byte*>(pC);
  576|     |    VecStore(c0, pCC+0);
  577|     |    VecStore(c1, pCC+16);
  578|     |}
  579|     |
  580|     |void
  581|     |GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
  582|     |{
  583|     |    // word is either 32-bit or 64-bit, depending on the platform.
  584|     |    // Load using a byte pointer to avoid possible alignment issues.
  585|     |    const byte* pAA = reinterpret_cast<const byte*>(pA);
  586|     |    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
  587|     |    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
  588|     |
  589|     |#if (CRYPTOPP_BIG_ENDIAN)
  590|     |    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  591|     |    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
  592|     |    a0 = VecPermute(a0, m);
  593|     |    a1 = VecPermute(a1, m);
  594|     |#endif
  595|     |
  596|     |    uint64x2_p c0, c1, c2, c3;
  597|     |    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
  598|     |    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);
  599|     |
  600|     |#if (CRYPTOPP_BIG_ENDIAN)
  601|     |    c0 = VecPermute(c0, m);
  602|     |    c1 = VecPermute(c1, m);
  603|     |#endif
  604|     |
  605|     |    byte* pCC = reinterpret_cast<byte*>(pC);
  606|     |    VecStore(c0, pCC+0);
  607|     |    VecStore(c1, pCC+16);
  608|     |}
  609|     |
  610|     |#endif
  611|     |
  612|     |NAMESPACE_END
  613|     |
  614|     |#endif  // CRYPTOPP_IMPORTS
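
Note on the arithmetic being measured: every F2N_Multiply_128x128_* routine in the listing implements the same three-multiplication Karatsuba split over GF(2) described in the Intel paper cited at the top of the file, only with different intrinsics (PCLMULQDQ, PMULL, or the Power8 vector carry-less multiply). The portable sketch below is a hypothetical cross-check of that identity and is not part of gf2n_simd.cpp; clmul64 is a bit-serial stand-in for the hardware carry-less multiply instruction.

#include <cstdint>
#include <cstdio>

// Bit-serial 64x64 -> 128-bit carry-less multiply (stand-in for PCLMULQDQ/PMULL).
static void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
{
    hi = lo = 0;
    for (unsigned int i = 0; i < 64; ++i)
        if ((b >> i) & 1)
        {
            lo ^= a << i;
            if (i) hi ^= a >> (64 - i);
        }
}

int main()
{
    // 128-bit operands a1:a0 and b1:b0, as in F2N_Multiply_128x128_*.
    uint64_t a0 = 0x0123456789abcdefULL, a1 = 0x0000000000000002ULL;
    uint64_t b0 = 0xfedcba9876543210ULL, b1 = 0x0000000000000003ULL;

    uint64_t lo1, lo0, hi1, hi0, mid1, mid0;
    clmul64(a0, b0, lo1, lo0);               // PMULL_00 / _mm_clmulepi64_si128(a, b, 0x00)
    clmul64(a1, b1, hi1, hi0);               // PMULL_11 / _mm_clmulepi64_si128(a, b, 0x11)
    clmul64(a0 ^ a1, b0 ^ b1, mid1, mid0);   // the single multiply of the XORed halves
    mid0 ^= lo0 ^ hi0;                       // Karatsuba middle term
    mid1 ^= lo1 ^ hi1;

    // 256-bit product c3:c2:c1:c0; the middle term lands 64 bits up.
    uint64_t c0 = lo0;
    uint64_t c1 = lo1 ^ mid0;
    uint64_t c2 = hi0 ^ mid1;
    uint64_t c3 = hi1;

    std::printf("%016llx %016llx %016llx %016llx\n",
                (unsigned long long)c3, (unsigned long long)c2,
                (unsigned long long)c1, (unsigned long long)c0);
    return 0;
}

F2N_Multiply_256x256_* applies the same split once more at the 128-bit level, which is why each of those routines issues exactly three 128x128 multiplies.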
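
The shift constants in the GF2NT_233_Reduce_* routines (23 = 256 - 233 and 10 = 74 - 64) suggest reduction modulo the trinomial x^233 + x^74 + 1, which also matches the 0x000001ff mask applied to c1. Assuming that polynomial, a minimal bit-serial reference reduction, hypothetical and useful only for cross-checking the uncovered CLMUL path against a known-good answer, could look like this:

#include <cstdint>

static inline bool get_bit(const uint64_t* w, int i) { return (w[i / 64] >> (i % 64)) & 1; }
static inline void flip_bit(uint64_t* w, int i)      { w[i / 64] ^= uint64_t(1) << (i % 64); }

// Reduce an up-to-512-bit product c[0..7] (little-endian 64-bit words)
// modulo p(x) = x^233 + x^74 + 1. Afterwards only bits 0..232 can be set.
void gf2n_233_reduce_ref(uint64_t c[8])
{
    for (int i = 511; i >= 233; --i)
        if (get_bit(c, i))
        {
            flip_bit(c, i);          // remove x^i
            flip_bit(c, i - 159);    // x^i = x^(i-233) * (x^74 + 1) mod p(x)
            flip_bit(c, i - 233);
        }
}

The two surviving halves, bits 0..127 and bits 128..232, correspond to the c0 and c1 vectors that the SIMD reductions store back through pC.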