Coverage Report

Created: 2024-11-21 07:03

/src/cryptopp/gf2n_simd.cpp
 Line|Count|Source
    1|     |// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton
    2|     |//                 Also based on PCLMULQDQ code by Jankowski, Laurent and
    3|     |//                 O'Mahony from Intel (see reference below).
    4|     |//
    5|     |//    This source file uses intrinsics and built-ins to gain access to
    6|     |//    CLMUL, ARMv8a, and Power8 instructions. A separate source file is
    7|     |//    needed because additional CXXFLAGS are required to enable the
    8|     |//    appropriate instruction sets in some build configurations.
    9|     |//
   10|     |//    Several speedups were taken from Intel Polynomial Multiplication
   11|     |//    Instruction and its Usage for Elliptic Curve Cryptography, by
   12|     |//    Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony,
   13|     |//    https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf
   14|     |//    There may be more speedups available; see https://eprint.iacr.org/2011/589.pdf.
   15|     |//    The IACR paper performs some optimizations that the compiler is
   16|     |//    expected to perform, like Common Subexpression Elimination to save
   17|     |//    on variables (among others). Note that the compiler may miss the
   18|     |//    optimization, so the IACR paper is useful. However, the code is GPL3
   19|     |//    and toxic for some users of the library, so it is not used here...
   20|     |
   21|     |#include "pch.h"
   22|     |#include "config.h"
   23|     |
   24|     |#ifndef CRYPTOPP_IMPORTS
   25|     |
   26|     |#include "gf2n.h"
   27|     |
   28|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
   29|     |# include <emmintrin.h>
   30|     |# include <wmmintrin.h>
   31|     |#endif
   32|     |
   33|     |#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
   34|     |# include "arm_simd.h"
   35|     |#endif
   36|     |
   37|     |#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
   38|     |# include "ppc_simd.h"
   39|     |#endif
   40|     |
   41|     |// Squash MS LNK4221 and libtool warnings
   42|     |extern const char GF2N_SIMD_FNAME[] = __FILE__;
   43|     |
   44|     |ANONYMOUS_NAMESPACE_BEGIN
   45|     |
   46|     |// ************************** ARMv8 ************************** //
   47|     |
   48|     |using CryptoPP::word;
   49|     |
   50|     |#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
   51|     |
   52|     |// c1c0 = a * b
   53|     |inline void
   54|     |F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b)
   55|     |{
   56|     |    uint64x2_t t1, t2, z0={0};
   57|     |
   58|     |    c0 = PMULL_00(a, b);
   59|     |    c1 = PMULL_11(a, b);
   60|     |    t1 = vmovq_n_u64(vgetq_lane_u64(a, 1));
   61|     |    t1 = veorq_u64(a, t1);
   62|     |    t2 = vmovq_n_u64(vgetq_lane_u64(b, 1));
   63|     |    t2 = veorq_u64(b, t2);
   64|     |    t1 = PMULL_00(t1, t2);
   65|     |    t1 = veorq_u64(c0, t1);
   66|     |    t1 = veorq_u64(c1, t1);
   67|     |    t2 = t1;
   68|     |    t1 = vextq_u64(z0, t1, 1);
   69|     |    t2 = vextq_u64(t2, z0, 1);
   70|     |    c0 = veorq_u64(c0, t1);
   71|     |    c1 = veorq_u64(c1, t2);
   72|     |}
   73|     |
   74|     |// c3c2c1c0 = a1a0 * b1b0
   75|     |inline void
   76|     |F2N_Multiply_256x256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0,
   77|     |    const uint64x2_t& b1, const uint64x2_t& b0, const uint64x2_t& a1, const uint64x2_t& a0)
   78|     |{
   79|     |    uint64x2_t c4, c5;
   80|     |    uint64x2_t x0=a0, x1=a1, y0=b0, y1=b1;
   81|     |
   82|     |    F2N_Multiply_128x128_ARMv8(c1, c0, x0, y0);
   83|     |    F2N_Multiply_128x128_ARMv8(c3, c2, x1, y1);
   84|     |
   85|     |    x0 = veorq_u64(x0, x1);
   86|     |    y0 = veorq_u64(y0, y1);
   87|     |
   88|     |    F2N_Multiply_128x128_ARMv8(c5, c4, x0, y0);
   89|     |
   90|     |    c4 = veorq_u64(c4, c0);
   91|     |    c4 = veorq_u64(c4, c2);
   92|     |    c5 = veorq_u64(c5, c1);
   93|     |    c5 = veorq_u64(c5, c3);
   94|     |    c1 = veorq_u64(c1, c4);
   95|     |    c2 = veorq_u64(c2, c5);
   96|     |}
   97|     |
   98|     |// c3c2c1c0 = a1a0 * a1a0
   99|     |inline void
  100|     |F2N_Square_256_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1,
  101|     |    uint64x2_t& c0, const uint64x2_t& a1, const uint64x2_t& a0)
  102|     |{
  103|     |    c0 = PMULL_00(a0, a0);
  104|     |    c1 = PMULL_11(a0, a0);
  105|     |    c2 = PMULL_00(a1, a1);
  106|     |    c3 = PMULL_11(a1, a1);
  107|     |}
  108|     |
  109|     |// x = (x << n), z = 0
  110|     |template <unsigned int N>
  111|     |inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x)
  112|     |{
  113|     |    uint64x2_t u=x, v, z={0};
  114|     |    x = vshlq_n_u64(x, N);
  115|     |    u = vshrq_n_u64(u, (64-N));
  116|     |    v = vcombine_u64(vget_low_u64(z), vget_low_u64(u));
  117|     |    x = vorrq_u64(x, v);
  118|     |    return x;
  119|     |}
  120|     |
  121|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  122|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  123|     |inline void
  124|     |GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0)
  125|     |{
  126|     |    const unsigned int mask[4] = {
  127|     |        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff,
  128|     |    };
  129|     |
  130|     |    uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0};
  131|     |    m0 = vreinterpretq_u64_u32(vld1q_u32(mask));
  132|     |    b1 = c1; a1 = c1;
  133|     |    a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0));
  134|     |    a1 = vshlq_n_u64(a1, 23);
  135|     |    a1 = vshrq_n_u64(a1, 23);
  136|     |    c1 = vorrq_u64(a1, a0);
  137|     |    b2 = vshrq_n_u64(c2, (64-23));
  138|     |    c3 = ShiftLeft128_ARMv8<23>(c3);
  139|     |    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  140|     |    c3 = vorrq_u64(c3, a0);
  141|     |    b1 = vshrq_n_u64(b1, (64-23));
  142|     |    c2 = ShiftLeft128_ARMv8<23>(c2);
  143|     |    a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0));
  144|     |    c2 = vorrq_u64(c2, a0);
  145|     |    b3 = c3;
  146|     |    b2 = vshrq_n_u64(c2, (64-10));
  147|     |    b3 = ShiftLeft128_ARMv8<10>(b3);
  148|     |    a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  149|     |    b3 = vorrq_u64(b3, a0);
  150|     |    a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0));
  151|     |    b3 = veorq_u64(b3, a0);
  152|     |    b1 = vshrq_n_u64(b3, (64-23));
  153|     |    b3 = ShiftLeft128_ARMv8<23>(b3);
  154|     |    b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0));
  155|     |    b3 = vorrq_u64(b3, b1);
  156|     |    c2 = veorq_u64(c2, b3);
  157|     |    b3 = c3;
  158|     |    b2 = vshrq_n_u64(c2, (64-10));
  159|     |    b3 = ShiftLeft128_ARMv8<10>(b3);
  160|     |    b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  161|     |    b3 = vorrq_u64(b3, b2);
  162|     |    b2 = c2;
  163|     |    b2 = ShiftLeft128_ARMv8<10>(b2);
  164|     |    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2));
  165|     |    c2 = veorq_u64(c2, a0);
  166|     |    a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3));
  167|     |    a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0));
  168|     |    a0 = vorrq_u64(a0, a1);
  169|     |    c3 = veorq_u64(c3, a0);
  170|     |    c0 = veorq_u64(c0, c2);
  171|     |    c1 = veorq_u64(c1, c3);
  172|     |    c1 = vandq_u64(c1, m0);
  173|     |}
  174|     |
  175|     |#endif
  176|     |
  177|     |// ************************** SSE ************************** //
  178|     |
  179|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
  180|     |
  181|     |using CryptoPP::word;
  182|     |
  183|     |// c1c0 = a * b
  184|     |inline void
  185|     |F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b)
  186|    0|{
  187|    0|    __m128i t1, t2;
  188|     |
  189|    0|    c0 = _mm_clmulepi64_si128(a, b, 0x00);
  190|    0|    c1 = _mm_clmulepi64_si128(a, b, 0x11);
  191|    0|    t1 = _mm_shuffle_epi32(a, 0xEE);
  192|    0|    t1 = _mm_xor_si128(a, t1);
  193|    0|    t2 = _mm_shuffle_epi32(b, 0xEE);
  194|    0|    t2 = _mm_xor_si128(b, t2);
  195|    0|    t1 = _mm_clmulepi64_si128(t1, t2, 0x00);
  196|    0|    t1 = _mm_xor_si128(c0, t1);
  197|    0|    t1 = _mm_xor_si128(c1, t1);
  198|    0|    t2 = t1;
  199|    0|    t1 = _mm_slli_si128(t1, 8);
  200|    0|    t2 = _mm_srli_si128(t2, 8);
  201|    0|    c0 = _mm_xor_si128(c0, t1);
  202|    0|    c1 = _mm_xor_si128(c1, t2);
  203|    0|}
  204|     |
  205|     |// c3c2c1c0 = a1a0 * b1b0
  206|     |inline void
  207|     |F2N_Multiply_256x256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0,
  208|     |    const __m128i& b1, const __m128i& b0, const __m128i& a1, const __m128i& a0)
  209|    0|{
  210|    0|    __m128i c4, c5;
  211|    0|    __m128i x0=a0, x1=a1, y0=b0, y1=b1;
  212|     |
  213|    0|    F2N_Multiply_128x128_CLMUL(c1, c0, x0, y0);
  214|    0|    F2N_Multiply_128x128_CLMUL(c3, c2, x1, y1);
  215|     |
  216|    0|    x0 = _mm_xor_si128(x0, x1);
  217|    0|    y0 = _mm_xor_si128(y0, y1);
  218|     |
  219|    0|    F2N_Multiply_128x128_CLMUL(c5, c4, x0, y0);
  220|     |
  221|    0|    c4 = _mm_xor_si128(c4, c0);
  222|    0|    c4 = _mm_xor_si128(c4, c2);
  223|    0|    c5 = _mm_xor_si128(c5, c1);
  224|    0|    c5 = _mm_xor_si128(c5, c3);
  225|    0|    c1 = _mm_xor_si128(c1, c4);
  226|    0|    c2 = _mm_xor_si128(c2, c5);
  227|    0|}
  228|     |
  229|     |// c3c2c1c0 = a1a0 * a1a0
  230|     |inline void
  231|     |F2N_Square_256_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1,
  232|     |    __m128i& c0, const __m128i& a1, const __m128i& a0)
  233|    0|{
  234|    0|    c0 = _mm_clmulepi64_si128(a0, a0, 0x00);
  235|    0|    c1 = _mm_clmulepi64_si128(a0, a0, 0x11);
  236|    0|    c2 = _mm_clmulepi64_si128(a1, a1, 0x00);
  237|    0|    c3 = _mm_clmulepi64_si128(a1, a1, 0x11);
  238|    0|}
  239|     |
  240|     |// x = (x << n), z = 0
  241|     |template <unsigned int N>
  242|     |inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z)
  243|    0|{
  244|    0|    __m128i u=x, v;
  245|    0|    x = _mm_slli_epi64(x, N);
  246|    0|    u = _mm_srli_epi64(u, (64-N));
  247|    0|    v = _mm_unpacklo_epi64(z, u);
  248|    0|    x = _mm_or_si128(x, v);
  249|    0|    return x;
  250|    0|}
     |     |  Unexecuted instantiation: gf2n_simd.cpp:long long __vector(2) (anonymous namespace)::ShiftLeft128_SSE<23u>(long long __vector(2), long long __vector(2) const&)
     |     |  Unexecuted instantiation: gf2n_simd.cpp:long long __vector(2) (anonymous namespace)::ShiftLeft128_SSE<10u>(long long __vector(2), long long __vector(2) const&)
  251|     |
  252|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  253|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  254|     |inline void
  255|     |GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0)
  256|    0|{
  257|    0|    const unsigned int m[4] = {
  258|    0|        0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff
  259|    0|    };
  260|     |
  261|    0|    __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0;
  262|    0|    m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
  263|    0|    z0 = _mm_setzero_si128();
  264|    0|    b1 = c1; a1 = c1;
  265|    0|    a0 = _mm_move_epi64(c1);
  266|    0|    a1 = _mm_slli_epi64(a1, 23);
  267|    0|    a1 = _mm_srli_epi64(a1, 23);
  268|    0|    c1 = _mm_or_si128(a1, a0);
  269|    0|    b2 = _mm_srli_epi64(c2, (64-23));
  270|    0|    c3 = ShiftLeft128_SSE<23>(c3, z0);
  271|    0|    a0 = _mm_unpackhi_epi64(b2, z0);
  272|    0|    c3 = _mm_or_si128(c3, a0);
  273|    0|    b1 = _mm_srli_epi64(b1, (64-23));
  274|    0|    c2 = ShiftLeft128_SSE<23>(c2, z0);
  275|    0|    a0 = _mm_unpackhi_epi64(b1, z0);
  276|    0|    c2 = _mm_or_si128(c2, a0);
  277|    0|    b3 = c3;
  278|    0|    b2 = _mm_srli_epi64(c2, (64-10));
  279|    0|    b3 = ShiftLeft128_SSE<10>(b3, z0);
  280|    0|    a0 = _mm_unpackhi_epi64(b2, z0);
  281|    0|    b3 = _mm_or_si128(b3, a0);
  282|    0|    a0 = _mm_unpackhi_epi64(c3, z0);
  283|    0|    b3 = _mm_xor_si128(b3, a0);
  284|    0|    b1 = _mm_srli_epi64(b3, (64-23));
  285|    0|    b3 = ShiftLeft128_SSE<23>(b3, z0);
  286|    0|    b3 = _mm_unpackhi_epi64(b3, z0);
  287|    0|    b3 = _mm_or_si128(b3, b1);
  288|    0|    c2 = _mm_xor_si128(c2, b3);
  289|    0|    b3 = c3;
  290|    0|    b2 = _mm_srli_epi64(c2, (64-10));
  291|    0|    b3 = ShiftLeft128_SSE<10>(b3, z0);
  292|    0|    b2 = _mm_unpackhi_epi64(b2, z0);
  293|    0|    b3 = _mm_or_si128(b3, b2);
  294|    0|    b2 = c2;
  295|    0|    b2 = ShiftLeft128_SSE<10>(b2, z0);
  296|    0|    a0 = _mm_unpacklo_epi64(z0, b2);
  297|    0|    c2 = _mm_xor_si128(c2, a0);
  298|    0|    a0 = _mm_unpacklo_epi64(z0, b3);
  299|    0|    a1 = _mm_unpackhi_epi64(b2, z0);
  300|    0|    a0 = _mm_or_si128(a0, a1);
  301|    0|    c3 = _mm_xor_si128(c3, a0);
  302|    0|    c0 = _mm_xor_si128(c0, c2);
  303|    0|    c1 = _mm_xor_si128(c1, c3);
  304|    0|    c1 = _mm_and_si128(c1, m0);
  305|    0|}
  306|     |
  307|     |#endif
  308|     |
  309|     |// ************************* Power8 ************************* //
  310|     |
  311|     |#if (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0
  312|     |
  313|     |using CryptoPP::byte;
  314|     |using CryptoPP::word;
  315|     |using CryptoPP::uint8x16_p;
  316|     |using CryptoPP::uint64x2_p;
  317|     |
  318|     |using CryptoPP::VecLoad;
  319|     |using CryptoPP::VecStore;
  320|     |
  321|     |using CryptoPP::VecOr;
  322|     |using CryptoPP::VecXor;
  323|     |using CryptoPP::VecAnd;
  324|     |
  325|     |using CryptoPP::VecPermute;
  326|     |using CryptoPP::VecMergeLow;
  327|     |using CryptoPP::VecMergeHigh;
  328|     |using CryptoPP::VecShiftLeft;
  329|     |using CryptoPP::VecShiftRight;
  330|     |
  331|     |using CryptoPP::VecIntelMultiply00;
  332|     |using CryptoPP::VecIntelMultiply11;
  333|     |
  334|     |// c1c0 = a * b
  335|     |inline void
  336|     |F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b)
  337|     |{
  338|     |    uint64x2_p t1, t2;
  339|     |    const uint64x2_p z0={0};
  340|     |
  341|     |    c0 = VecIntelMultiply00(a, b);
  342|     |    c1 = VecIntelMultiply11(a, b);
  343|     |    t1 = VecMergeLow(a, a);
  344|     |    t1 = VecXor(a, t1);
  345|     |    t2 = VecMergeLow(b, b);
  346|     |    t2 = VecXor(b, t2);
  347|     |    t1 = VecIntelMultiply00(t1, t2);
  348|     |    t1 = VecXor(c0, t1);
  349|     |    t1 = VecXor(c1, t1);
  350|     |    t2 = t1;
  351|     |    t1 = VecMergeHigh(z0, t1);
  352|     |    t2 = VecMergeLow(t2, z0);
  353|     |    c0 = VecXor(c0, t1);
  354|     |    c1 = VecXor(c1, t2);
  355|     |}
  356|     |
  357|     |// c3c2c1c0 = a1a0 * b1b0
  358|     |inline void
  359|     |F2N_Multiply_256x256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0,
  360|     |    const uint64x2_p& b1, const uint64x2_p& b0, const uint64x2_p& a1, const uint64x2_p& a0)
  361|     |{
  362|     |    uint64x2_p c4, c5;
  363|     |    uint64x2_p x0=a0, x1=a1, y0=b0, y1=b1;
  364|     |
  365|     |    F2N_Multiply_128x128_POWER8(c1, c0, x0, y0);
  366|     |    F2N_Multiply_128x128_POWER8(c3, c2, x1, y1);
  367|     |
  368|     |    x0 = VecXor(x0, x1);
  369|     |    y0 = VecXor(y0, y1);
  370|     |
  371|     |    F2N_Multiply_128x128_POWER8(c5, c4, x0, y0);
  372|     |
  373|     |    c4 = VecXor(c4, c0);
  374|     |    c4 = VecXor(c4, c2);
  375|     |    c5 = VecXor(c5, c1);
  376|     |    c5 = VecXor(c5, c3);
  377|     |    c1 = VecXor(c1, c4);
  378|     |    c2 = VecXor(c2, c5);
  379|     |}
  380|     |
  381|     |// c3c2c1c0 = a1a0 * a1a0
  382|     |inline void
  383|     |F2N_Square_256_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1,
  384|     |    uint64x2_p& c0, const uint64x2_p& a1, const uint64x2_p& a0)
  385|     |{
  386|     |    c0 = VecIntelMultiply00(a0, a0);
  387|     |    c1 = VecIntelMultiply11(a0, a0);
  388|     |    c2 = VecIntelMultiply00(a1, a1);
  389|     |    c3 = VecIntelMultiply11(a1, a1);
  390|     |}
  391|     |
  392|     |// x = (x << n), z = 0
  393|     |template <unsigned int N>
  394|     |inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x)
  395|     |{
  396|     |    uint64x2_p u=x, v;
  397|     |    const uint64x2_p z={0};
  398|     |
  399|     |    x = VecShiftLeft<N>(x);
  400|     |    u = VecShiftRight<64-N>(u);
  401|     |    v = VecMergeHigh(z, u);
  402|     |    x = VecOr(x, v);
  403|     |    return x;
  404|     |}
  405|     |
  406|     |// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. See the
  407|     |// Intel paper or https://github.com/antonblanchard/crc32-vpmsum.
  408|     |inline void
  409|     |GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0)
  410|     |{
  411|     |    const uint64_t mod[] = {W64LIT(0xffffffffffffffff), W64LIT(0x01ffffffffff)};
  412|     |    const uint64x2_p m0 = (uint64x2_p)VecLoad(mod);
  413|     |
  414|     |    uint64x2_p b3, b2, b1, /*b0,*/ a1, a0;
  415|     |    const uint64x2_p z0={0};
  416|     |
  417|     |    b1 = c1; a1 = c1;
  418|     |    a0 = VecMergeHigh(c1, z0);
  419|     |    a1 = VecShiftLeft<23>(a1);
  420|     |    a1 = VecShiftRight<23>(a1);
  421|     |    c1 = VecOr(a1, a0);
  422|     |    b2 = VecShiftRight<64-23>(c2);
  423|     |    c3 = ShiftLeft128_POWER8<23>(c3);
  424|     |    a0 = VecMergeLow(b2, z0);
  425|     |    c3 = VecOr(c3, a0);
  426|     |    b1 = VecShiftRight<64-23>(b1);
  427|     |    c2 = ShiftLeft128_POWER8<23>(c2);
  428|     |    a0 = VecMergeLow(b1, z0);
  429|     |    c2 = VecOr(c2, a0);
  430|     |    b3 = c3;
  431|     |    b2 = VecShiftRight<64-10>(c2);
  432|     |    b3 = ShiftLeft128_POWER8<10>(b3);
  433|     |    a0 = VecMergeLow(b2, z0);
  434|     |    b3 = VecOr(b3, a0);
  435|     |    a0 = VecMergeLow(c3, z0);
  436|     |    b3 = VecXor(b3, a0);
  437|     |    b1 = VecShiftRight<64-23>(b3);
  438|     |    b3 = ShiftLeft128_POWER8<23>(b3);
  439|     |    b3 = VecMergeLow(b3, z0);
  440|     |    b3 = VecOr(b3, b1);
  441|     |    c2 = VecXor(c2, b3);
  442|     |    b3 = c3;
  443|     |    b2 = VecShiftRight<64-10>(c2);
  444|     |    b3 = ShiftLeft128_POWER8<10>(b3);
  445|     |    b2 = VecMergeLow(b2, z0);
  446|     |    b3 = VecOr(b3, b2);
  447|     |    b2 = c2;
  448|     |    b2 = ShiftLeft128_POWER8<10>(b2);
  449|     |    a0 = VecMergeHigh(z0, b2);
  450|     |    c2 = VecXor(c2, a0);
  451|     |    a0 = VecMergeHigh(z0, b3);
  452|     |    a1 = VecMergeLow(b2, z0);
  453|     |    a0 = VecOr(a0, a1);
  454|     |    c3 = VecXor(c3, a0);
  455|     |    c0 = VecXor(c0, c2);
  456|     |    c1 = VecXor(c1, c3);
  457|     |    c1 = VecAnd(c1, m0);
  458|     |}
  459|     |
  460|     |#endif
  461|     |
  462|     |ANONYMOUS_NAMESPACE_END
  463|     |
  464|     |NAMESPACE_BEGIN(CryptoPP)
  465|     |
  466|     |#if (CRYPTOPP_CLMUL_AVAILABLE)
  467|     |
  468|     |void
  469|     |GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC)
  470|    0|{
  471|    0|    enum {S=sizeof(__m128i)/sizeof(word)};
  472|    0|    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
  473|    0|    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
  474|    0|    __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+0*S));
  475|    0|    __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pB+1*S));
  476|     |
  477|    0|    __m128i c0, c1, c2, c3;
  478|    0|    F2N_Multiply_256x256_CLMUL(c3, c2, c1, c0, a1, a0, b1, b0);
  479|    0|    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);
  480|     |
  481|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
  482|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
  483|    0|}
  484|     |
  485|     |void
  486|     |GF2NT_233_Square_Reduce_CLMUL(const word* pA, word* pC)
  487|    0|{
  488|    0|    enum {S=sizeof(__m128i)/sizeof(word)};
  489|    0|    __m128i a0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+0*S));
  490|    0|    __m128i a1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pA+1*S));
  491|     |
  492|    0|    __m128i c0, c1, c2, c3;
  493|    0|    F2N_Square_256_CLMUL(c3, c2, c1, c0, a1, a0);
  494|    0|    GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0);
  495|     |
  496|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+0*S), c0);
  497|    0|    _mm_storeu_si128(reinterpret_cast<__m128i*>(pC+1*S), c1);
  498|    0|}
  499|     |
  500|     |#elif (CRYPTOPP_ARM_PMULL_AVAILABLE)
  501|     |
  502|     |void
  503|     |GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC)
  504|     |{
  505|     |    // word is either 32-bit or 64-bit, depending on the platform.
  506|     |    // Load using a 32-bit pointer to avoid possible alignment issues.
  507|     |    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
  508|     |    const uint32_t* pBB = reinterpret_cast<const uint32_t*>(pB);
  509|     |
  510|     |    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
  511|     |    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
  512|     |    uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0));
  513|     |    uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4));
  514|     |
  515|     |    uint64x2_t c0, c1, c2, c3;
  516|     |    F2N_Multiply_256x256_ARMv8(c3, c2, c1, c0, a1, a0, b1, b0);
  517|     |    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);
  518|     |
  519|     |    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
  520|     |    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
  521|     |    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
  522|     |}
  523|     |
  524|     |void
  525|     |GF2NT_233_Square_Reduce_ARMv8(const word* pA, word* pC)
  526|     |{
  527|     |    // word is either 32-bit or 64-bit, depending on the platform.
  528|     |    // Load using a 32-bit pointer to avoid possible alignment issues.
  529|     |    const uint32_t* pAA = reinterpret_cast<const uint32_t*>(pA);
  530|     |    uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0));
  531|     |    uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4));
  532|     |
  533|     |    uint64x2_t c0, c1, c2, c3;
  534|     |    F2N_Square_256_ARMv8(c3, c2, c1, c0, a1, a0);
  535|     |    GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0);
  536|     |
  537|     |    uint32_t* pCC = reinterpret_cast<uint32_t*>(pC);
  538|     |    vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0));
  539|     |    vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1));
  540|     |}
  541|     |
  542|     |#elif (CRYPTOPP_POWER8_VMULL_AVAILABLE) && 0
  543|     |
  544|     |void
  545|     |GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC)
  546|     |{
  547|     |    // word is either 32-bit or 64-bit, depending on the platform.
  548|     |    // Load using a byte pointer to avoid possible alignment issues.
  549|     |    const byte* pAA = reinterpret_cast<const byte*>(pA);
  550|     |    const byte* pBB = reinterpret_cast<const byte*>(pB);
  551|     |
  552|     |    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
  553|     |    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
  554|     |    uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0);
  555|     |    uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16);
  556|     |
  557|     |#if (CRYPTOPP_BIG_ENDIAN)
  558|     |    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  559|     |    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
  560|     |    a0 = VecPermute(a0, m);
  561|     |    a1 = VecPermute(a1, m);
  562|     |    b0 = VecPermute(b0, m);
  563|     |    b1 = VecPermute(b1, m);
  564|     |#endif
  565|     |
  566|     |    uint64x2_p c0, c1, c2, c3;
  567|     |    F2N_Multiply_256x256_POWER8(c3, c2, c1, c0, a1, a0, b1, b0);
  568|     |    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);
  569|     |
  570|     |#if (CRYPTOPP_BIG_ENDIAN)
  571|     |    c0 = VecPermute(c0, m);
  572|     |    c1 = VecPermute(c1, m);
  573|     |#endif
  574|     |
  575|     |    byte* pCC = reinterpret_cast<byte*>(pC);
  576|     |    VecStore(c0, pCC+0);
  577|     |    VecStore(c1, pCC+16);
  578|     |}
  579|     |
  580|     |void
  581|     |GF2NT_233_Square_Reduce_POWER8(const word* pA, word* pC)
  582|     |{
  583|     |    // word is either 32-bit or 64-bit, depending on the platform.
  584|     |    // Load using a byte pointer to avoid possible alignment issues.
  585|     |    const byte* pAA = reinterpret_cast<const byte*>(pA);
  586|     |    uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0);
  587|     |    uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16);
  588|     |
  589|     |#if (CRYPTOPP_BIG_ENDIAN)
  590|     |    const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
  591|     |    const uint8x16_p m = (uint8x16_p)VecLoad(mb);
  592|     |    a0 = VecPermute(a0, m);
  593|     |    a1 = VecPermute(a1, m);
  594|     |#endif
  595|     |
  596|     |    uint64x2_p c0, c1, c2, c3;
  597|     |    F2N_Square_256_POWER8(c3, c2, c1, c0, a1, a0);
  598|     |    GF2NT_233_Reduce_POWER8(c3, c2, c1, c0);
  599|     |
  600|     |#if (CRYPTOPP_BIG_ENDIAN)
  601|     |    c0 = VecPermute(c0, m);
  602|     |    c1 = VecPermute(c1, m);
  603|     |#endif
  604|     |
  605|     |    byte* pCC = reinterpret_cast<byte*>(pC);
  606|     |    VecStore(c0, pCC+0);
  607|     |    VecStore(c1, pCC+16);
  608|     |}
  609|     |
  610|     |#endif
  611|     |
  612|     |NAMESPACE_END
  613|     |
  614|     |#endif  // CRYPTOPP_IMPORTS
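
Note on the arithmetic being measured: every F2N_Multiply_128x128_* routine in the listing implements the same three-multiplication Karatsuba split over GF(2) described in the Intel paper cited at the top of the file, only with different intrinsics (PCLMULQDQ, PMULL, or the Power8 vector carry-less multiply). The portable sketch below is a hypothetical cross-check of that identity and is not part of gf2n_simd.cpp; clmul64 is a bit-serial stand-in for the hardware carry-less multiply instruction.

#include <cstdint>
#include <cstdio>

// Bit-serial 64x64 -> 128-bit carry-less multiply (stand-in for PCLMULQDQ/PMULL).
static void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
{
    hi = lo = 0;
    for (unsigned int i = 0; i < 64; ++i)
        if ((b >> i) & 1)
        {
            lo ^= a << i;
            if (i) hi ^= a >> (64 - i);
        }
}

int main()
{
    // 128-bit operands a1:a0 and b1:b0, as in F2N_Multiply_128x128_*.
    uint64_t a0 = 0x0123456789abcdefULL, a1 = 0x0000000000000002ULL;
    uint64_t b0 = 0xfedcba9876543210ULL, b1 = 0x0000000000000003ULL;

    uint64_t lo1, lo0, hi1, hi0, mid1, mid0;
    clmul64(a0, b0, lo1, lo0);               // PMULL_00 / _mm_clmulepi64_si128(a, b, 0x00)
    clmul64(a1, b1, hi1, hi0);               // PMULL_11 / _mm_clmulepi64_si128(a, b, 0x11)
    clmul64(a0 ^ a1, b0 ^ b1, mid1, mid0);   // the single multiply of the XORed halves
    mid0 ^= lo0 ^ hi0;                       // Karatsuba middle term
    mid1 ^= lo1 ^ hi1;

    // 256-bit product c3:c2:c1:c0; the middle term lands 64 bits up.
    uint64_t c0 = lo0;
    uint64_t c1 = lo1 ^ mid0;
    uint64_t c2 = hi0 ^ mid1;
    uint64_t c3 = hi1;

    std::printf("%016llx %016llx %016llx %016llx\n",
                (unsigned long long)c3, (unsigned long long)c2,
                (unsigned long long)c1, (unsigned long long)c0);
    return 0;
}

F2N_Multiply_256x256_* applies the same split once more at the 128-bit level, which is why each of those routines issues exactly three 128x128 multiplies.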
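
The shift constants in the GF2NT_233_Reduce_* routines (23 = 256 - 233 and 10 = 74 - 64) suggest reduction modulo the trinomial x^233 + x^74 + 1, which also matches the 0x000001ff mask applied to c1. Assuming that polynomial, a minimal bit-serial reference reduction, hypothetical and useful only for cross-checking the uncovered CLMUL path against a known-good answer, could look like this:

#include <cstdint>

static inline bool get_bit(const uint64_t* w, int i) { return (w[i / 64] >> (i % 64)) & 1; }
static inline void flip_bit(uint64_t* w, int i)      { w[i / 64] ^= uint64_t(1) << (i % 64); }

// Reduce an up-to-512-bit product c[0..7] (little-endian 64-bit words)
// modulo p(x) = x^233 + x^74 + 1. Afterwards only bits 0..232 can be set.
void gf2n_233_reduce_ref(uint64_t c[8])
{
    for (int i = 511; i >= 233; --i)
        if (get_bit(c, i))
        {
            flip_bit(c, i);          // remove x^i
            flip_bit(c, i - 159);    // x^i = x^(i-233) * (x^74 + 1) mod p(x)
            flip_bit(c, i - 233);
        }
}

The two surviving halves, bits 0..127 and bits 128..232, correspond to the c0 and c1 vectors that the SIMD reductions store back through pC.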