/src/cryptopp/chacha_avx.cpp
// chacha_avx.cpp - written and placed in the public domain by
//                  Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AVX2 instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate
// instruction sets in some build configurations.
//
// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
// * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.

#include "pch.h"
#include "config.h"

#include "chacha.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;

// Sun Studio 12.4 is OK; 12.5 and 12.6 fail with a compile error.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif
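// MAYBE_CONST is used below in the casts applied to the input pointer
// for _mm256_loadu_si256; the affected Sun Studio releases appear to
// reject the const-qualified form.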

// VS2017 and global optimization bug. Also see
// https://github.com/weidai11/cryptopp/issues/649 and
// https://github.com/weidai11/cryptopp/issues/735. Issue 649
// affects AES, but the same problem shows up here. Issue 735 is
// the ChaCha AVX2 cut-in, where the bug surfaced again.
#if (CRYPTOPP_MSC_VERSION >= 1910) && (CRYPTOPP_MSC_VERSION <= 1916)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but Clang issues the warning based on the
// pointer type rather than the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN

#if (CRYPTOPP_AVX2_AVAILABLE)

template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}

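// Rotations by a multiple of 8 bits can be performed with a byte
// shuffle instead of two shifts and an OR, so RotateLeft<8> and
// RotateLeft<16> are specialized to a single _mm256_shuffle_epi8
// using a per-lane byte-permutation mask.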
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
                                         14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}

template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
                                         13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}

#endif // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_AVX2_AVAILABLE)

void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
    const __m256i state0 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
    const __m256i state1 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
    const __m256i state2 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
    const __m256i state3 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
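    // Each 16-byte row of the ChaCha state is broadcast into both 128-bit
    // lanes of a ymm register, so every register set below works on two
    // blocks at once. With four register sets, one call produces eight
    // 64-byte keystream blocks (512 bytes).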
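    // Per-block counter increments. C measures the distance from state[12]
    // (the low counter word) to 0xFFFFFFFF, so an expression such as (C < 4)
    // evaluates to 1 exactly when state[12]+4 wraps, carrying into the next
    // word of the counter.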
    const word32 C = 0xFFFFFFFFu - state[12];
    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);

    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

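    // Each iteration performs one ChaCha double round (a column round
    // followed by a diagonal round) on all four register sets. For
    // reference, a scalar quarter round on (a,b,c,d) is:
    //   a += b; d ^= a; d = rotl(d, 16);
    //   c += d; b ^= c; b = rotl(b, 12);
    //   a += b; d ^= a; d = rotl(d,  8);
    //   c += d; b ^= c; b = rotl(b,  7);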
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

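        // Rotate the b, c and d rows within each 128-bit lane by one, two
        // and three 32-bit positions. This lines the diagonals up in the
        // columns, so the quarter rounds below (the same sequence as above)
        // operate on the diagonals.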
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

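        // Undo the diagonalization: rotate the b, c and d rows back into
        // column order before the next iteration.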
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }

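    // Feed-forward: add the original input state (including the per-block
    // counters) to the working state to form the keystream, as the ChaCha
    // specification requires.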
    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi32(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi32(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi32(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi32(X3_3, CTR3);

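    // Write the keystream, XORing it with the input when a message is
    // supplied. Each register holds one row of two different blocks, one
    // per 128-bit lane. _mm256_permute2x128_si256 with control 1 + (3 << 4)
    // gathers the high lanes of a register pair, which hold the blocks at
    // counter offsets +0..+3; those four blocks are written first.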
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }
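    // The remaining stores use control 0 + (2 << 4) to gather the low
    // 128-bit lanes, which hold the blocks at counter offsets +4..+7.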
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }

    // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    _mm256_zeroupper();
}

#endif // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END