/src/cryptopp/xts.cpp

Source (jump to first uncovered line)
// xts.cpp - written and placed in the public domain by Jeffrey Walton

// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
// base architecture. We can use the SIMD code below without an
// architecture option. No runtime tests are required. Unfortunately,
// we can't use it on Altivec because an architecture switch is required.
// The updated XorBuffer gains 0.3 to 1.5 cpb on these architectures for
// 16-byte block sizes.

#include "pch.h"

#include "xts.h"
#include "misc.h"
#include "modes.h"
#include "cpu.h"

#if defined(CRYPTOPP_DEBUG)
# include "aes.h"
# include "threefish.h"
#endif

// 0.3 to 0.4 cpb profit
#if defined(__SSE2__) || defined(_M_X64)
# include <emmintrin.h>
#endif

#if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
# if (CRYPTOPP_ARM_NEON_HEADER) || (CRYPTOPP_ARM_ASIMD_AVAILABLE)
#  include <arm_neon.h>
# endif
#endif

#if defined(__ALTIVEC__)
# include "ppc_simd.h"
#endif

ANONYMOUS_NAMESPACE_BEGIN

using namespace CryptoPP;

#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)

using CryptoPP::AES;
using CryptoPP::XTS_Mode;
using CryptoPP::Threefish512;

// Debug-only compile check. Declaring these objects makes the compiler
// instantiate the XTS_Mode templates for the listed ciphers; the objects
// themselves are never used.
void Modes_TestInstantiations()
{
    XTS_Mode<AES>::Encryption aesEnc0;
    XTS_Mode<AES>::Decryption aesDec0;
    XTS_Mode<AES>::Encryption aesEnc1;
    XTS_Mode<AES>::Decryption aesDec1;

#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
    // Wide-block ciphers are only instantiated when the feature is enabled
    XTS_Mode<Threefish512>::Encryption wideEnc;
    XTS_Mode<Threefish512>::Decryption wideDec;
#endif
}
#endif  // CRYPTOPP_DEBUG

// XORs count bytes of input with mask and stores the result in output.
// count is expected to be a non-zero multiple of 16 (asserted in debug
// builds); the SIMD paths walk the buffers 16 bytes per iteration.
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
{
    CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));

#if !defined(CRYPTOPP_DISABLE_ASM) && (defined(__SSE2__) || defined(_M_X64))
    // SSE2: unaligned 128-bit loads, XOR, unaligned store
    for (size_t pos=0; pos<count; pos+=16)
    {
        const __m128i data = _mm_loadu_si128(CONST_M128_CAST(input+pos));
        const __m128i tweak = _mm_loadu_si128(CONST_M128_CAST(mask+pos));
        _mm_storeu_si128(M128_CAST(output+pos), _mm_xor_si128(data, tweak));
    }

#elif !defined(CRYPTOPP_DISABLE_ASM) && (defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64))
    // NEON/ASIMD: 16-byte vector loads, XOR, store
    for (size_t pos=0; pos<count; pos+=16)
    {
        const uint8x16_t data = vld1q_u8(input+pos);
        const uint8x16_t tweak = vld1q_u8(mask+pos);
        vst1q_u8(output+pos, veorq_u8(data, tweak));
    }

#elif !defined(CRYPTOPP_DISABLE_ASM) && defined(__ALTIVEC__)
    // Altivec: helpers come from ppc_simd.h
    for (size_t pos=0; pos<count; pos+=16)
        VecStore(VecXor(VecLoad(input+pos), VecLoad(mask+pos)), output+pos);

#else
    // Portable fallback; also taken when CRYPTOPP_DISABLE_ASM is defined
    xorbuf(output, input, mask, count);
#endif
}

// In-place form: XORs count bytes of mask into buf.
inline void XorBuffer(byte *buf, const byte *mask, size_t count)
{
    // buf is both the source and the destination of the 4-argument form
    byte *const target = buf;
    XorBuffer(target, target, mask, count);
}

// Borrowed from CMAC, but little-endian representation
// Treats the len bytes at in as one little-endian integer, shifts it left
// by one bit and writes the result to out. If a bit is shifted out of the
// top, the reduction constant for the block size is XORed into the low
// bytes of out. in and out may refer to the same buffer: each word is read
// before it is written.
inline void GF_Double(byte *out, const byte* in, unsigned int len)
{
    // Shift word by word from the low end, propagating the top bit of each
    // word into the bottom bit of the next one.
#if defined(CRYPTOPP_WORD128_AVAILABLE)
    word128 carry = 0;
    const size_t wordCount = len/16;
    for (size_t w=0; w<wordCount; ++w)
    {
        const size_t idx = w*16;
        const word128 v = GetWord<word128>(false, LITTLE_ENDIAN_ORDER, in+idx);
        PutWord<word128>(false, LITTLE_ENDIAN_ORDER, out+idx, (v << 1) + carry);
        carry = (v >> 127);
    }
#elif defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
    word64 carry = 0;
    const size_t wordCount = len/8;
    for (size_t w=0; w<wordCount; ++w)
    {
        const size_t idx = w*8;
        const word64 v = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
        PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, (v << 1) + carry);
        carry = (v >> 63);
    }
#else
    word32 carry = 0;
    const size_t wordCount = len/4;
    for (size_t w=0; w<wordCount; ++w)
    {
        const size_t idx = w*4;
        const word32 v = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
        PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, (v << 1) + carry);
        carry = (v >> 31);
    }
#endif

#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS

    CRYPTOPP_ASSERT(IsPowerOf2(len));
    CRYPTOPP_ASSERT(len >= 16);
    CRYPTOPP_ASSERT(len <= 128);

    // No bit left the top of the value, so there is nothing to reduce
    if (!carry)
        return;

    // The constants below are the low-order terms of each polynomial,
    // stored little-endian starting at out[0].
    switch (len)
    {
    case 16:
        out[0] ^= 0x87;
        break;
    case 32:
        // https://crypto.stackexchange.com/q/9815/10496
        // Polynomial x^256 + x^10 + x^5 + x^2 + 1
        out[1] ^= 4;
        out[0] ^= 0x25;
        break;
    case 64:
        // https://crypto.stackexchange.com/q/9815/10496
        // Polynomial x^512 + x^8 + x^5 + x^2 + 1
        out[1] ^= 1;
        out[0] ^= 0x25;
        break;
    case 128:
        // https://crypto.stackexchange.com/q/9815/10496
        // Polynomial x^1024 + x^19 + x^6 + x + 1
        // (out[1] takes no term from this polynomial)
        out[2] ^= 8;
        out[0] ^= 0x43;
        break;
    default:
        CRYPTOPP_ASSERT(0);
    }
#else
    CRYPTOPP_ASSERT(len == 16);

    // Only the 16-byte block size is supported in this configuration
    if (carry)
        out[0] ^= 0x87;
#endif  // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
}

// In-place doubling: the result overwrites the len-byte block at inout.
inline void GF_Double(byte *inout, unsigned int len)
{
    // The same buffer is passed as output and input of the 3-argument form
    byte *const block = inout;
    GF_Double(block, block, len);
}

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

// Checks that length is a block size this mode accepts: a power of two in
// [16, 128] when wide block ciphers are enabled, exactly 16 otherwise.
// Asserts in debug builds and throws InvalidArgument when the check fails.
void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
{
#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
    const bool acceptable = (length >= 16) && (length <= 128) && IsPowerOf2(length);
    CRYPTOPP_ASSERT(acceptable);
    if (!acceptable)
        throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
#else
    const bool acceptable = (length == 16);
    CRYPTOPP_ASSERT(acceptable);
    if (!acceptable)
        throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
#endif
}

// Validates the combined key length (block-cipher key plus tweak key).
// Throws InvalidKeyLength when length is odd or when half of it is not a
// key length the underlying block cipher accepts.
void XTS_ModeBase::ThrowIfInvalidKeyLength(size_t length)
{
    CRYPTOPP_ASSERT(length % 2 == 0);

    // SetKey() hands length/2 bytes to each cipher. An odd length cannot be
    // split evenly, and rounding up before the check would let a length
    // such as 2*k-1 through only for the last byte to be dropped later.
    if ((length % 2 != 0) || !GetBlockCipher().IsValidKeyLength(length/2))
        throw InvalidKeyLength(AlgorithmName(), length);
}

void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
{
    ThrowIfInvalidKeyLength(length);
    ThrowIfInvalidBlockSize(BlockSize());

    const size_t klen = length/2;
    AccessBlockCipher().SetKey(key+0, klen, params);
    AccessTweakCipher().SetKey(key+klen, klen, params);

    ResizeBuffers();

    size_t ivLength;
    const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
    Resynchronize(iv, (int)ivLength);
}

void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
{
    BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
    std::memcpy(m_xregister, m_register, ivLength);
    GetTweakCipher().ProcessBlock(m_xregister);
}

// Restarts the mode from a 64-bit sector number. The number is written into
// the first 8 bytes of a block-sized buffer using the requested byte order,
// the remaining bytes are zeroed, and the buffer is used as the IV.
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
{
    const size_t sectorBytes = 8;
    const size_t tweakSize = GetTweakCipher().BlockSize();

    // Build the IV: sector number first, zero padding after it
    SecByteBlock tweak(tweakSize);
    PutWord<word64>(false, order, tweak, sector);
    std::memset(tweak+sectorBytes, 0x00, tweak.size()-sectorBytes);

    BlockOrientedCipherModeBase::Resynchronize(tweak, static_cast<int>(tweak.size()));

    // Turn the IV into the initial tweak with the tweak cipher
    std::memcpy(m_xregister, tweak, tweak.size());
    GetTweakCipher().ProcessBlock(m_xregister);
}

// Resizes the base-class buffers, then sizes the XTS workspace and tweak
// register to hold ParallelBlocks blocks each.
void XTS_ModeBase::ResizeBuffers()
{
    BlockOrientedCipherModeBase::ResizeBuffers();

    const size_t bufferSize = GetBlockCipher().BlockSize()*ParallelBlocks;
    m_xworkspace.New(bufferSize);
    m_xregister.New(bufferSize);
}

// ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
// selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
// can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
// of registers. The unneeded code paths should be removed by optimizer.
// The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
// Processes length bytes from inString into outString, one or more whole
// blocks at a time. Each block is XORed with its tweak, run through the
// block cipher and XORed with the same tweak again; the tweak is doubled
// with GF_Double between blocks. length is expected to be a multiple of the
// block size (asserted in debug builds). Any trailing partial block is left
// unprocessed rather than being read and written past the buffers.
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
    // data unit is multiple of 16 bytes
    CRYPTOPP_ASSERT(length % BlockSize() == 0);

    enum { lastParallelBlock = ParallelBlocks-1 };
    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t parallelSize = blockSize*ParallelBlocks;

    // encrypt the data unit, optimal size at a time
    while (length >= parallelSize)
    {
        // m_xregister[0] always points to the next tweak. Derive the tweaks
        // for the remaining blocks of this batch from it.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
        GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
        GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);

        if (ParallelBlocks > 4)
        {
            GF_Double(m_xregister+4*blockSize, m_xregister+3*blockSize, blockSize);
            GF_Double(m_xregister+5*blockSize, m_xregister+4*blockSize, blockSize);
            GF_Double(m_xregister+6*blockSize, m_xregister+5*blockSize, blockSize);
            GF_Double(m_xregister+7*blockSize, m_xregister+6*blockSize, blockSize);
        }
        if (ParallelBlocks > 8)
        {
            GF_Double(m_xregister+8*blockSize, m_xregister+7*blockSize, blockSize);
            GF_Double(m_xregister+9*blockSize, m_xregister+8*blockSize, blockSize);
            GF_Double(m_xregister+10*blockSize, m_xregister+9*blockSize, blockSize);
            GF_Double(m_xregister+11*blockSize, m_xregister+10*blockSize, blockSize);
        }

        // merge the tweaks into the input blocks
        XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);

        // process the batch, merge the tweaks into the output blocks
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, parallelSize, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);

        inString += parallelSize;
        outString += parallelSize;
        length -= parallelSize;
    }

    // encrypt the data unit, 4 blocks at a time
    while (ParallelBlocks == 12 && length >= blockSize*4)
    {
        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
        GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
        GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);

        // merge the tweaks into the input blocks
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);

        // process the batch, merge the tweaks into the output blocks
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, blockSize*4, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);

        inString += blockSize*4;
        outString += blockSize*4;
        length -= blockSize*4;
    }

    // encrypt the data unit, 2 blocks at a time
    while (ParallelBlocks == 8 && length >= blockSize*2)
    {
        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);

        // merge the tweaks into the input blocks
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);

        // process the batch, merge the tweaks into the output blocks
        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
            outString, blockSize*2, BlockTransformation::BT_AllowParallel);

        // m_xregister[0] always points to the next tweak.
        GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);

        inString += blockSize*2;
        outString += blockSize*2;
        length -= blockSize*2;
    }

    // encrypt the data unit, blocksize at a time. The loop requires a full
    // block to remain: with a plain "while (length)" a length that is not a
    // block multiple would wrap around on the subtraction below and the
    // loop would run far past the end of both buffers.
    while (length >= blockSize)
    {
        // merge the tweak into the input block
        XorBuffer(m_xworkspace, inString, m_xregister, blockSize);

        // encrypt one block
        GetBlockCipher().ProcessBlock(m_xworkspace);

        // merge the tweak into the output block
        XorBuffer(outString, m_xworkspace, m_xregister, blockSize);

        // Multiply T by alpha
        GF_Double(m_xregister, blockSize);

        inString += blockSize;
        outString += blockSize;
        length -= blockSize;
    }
}

// Processes the final part of a message, which may end in a partial block.
// Throws InvalidArgument when fewer than BlockSize() bytes are supplied,
// then dispatches to the plaintext or ciphertext routine depending on the
// direction of the transformation. Returns whatever that routine returns.
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // ciphertext stealing needs at least one full block to borrow from
    const bool hasFullBlock = (inLength >= BlockSize());
    CRYPTOPP_ASSERT(hasFullBlock);

    if (!hasFullBlock)
        throw InvalidArgument("XTS: message is too short for ciphertext stealing");

    return IsForwardTransformation()
        ? ProcessLastPlainBlock(outString, outLength, inString, inLength)
        : ProcessLastCipherBlock(outString, outLength, inString, inLength);
}

// Encrypts the final inLength bytes of a message, using ciphertext stealing
// when inLength is not a multiple of the block size. outString must have
// room for at least inLength bytes (asserted in debug builds). The caller
// is expected to supply at least one full block; ProcessLastBlock() checks
// this before dispatching here. Returns the number of bytes written, which
// equals inLength.
size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // ensure output buffer is large enough
    CRYPTOPP_ASSERT(outLength >= inLength);

    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t blocks = inLength / blockSize;
    const size_t tail = inLength % blockSize;
    outLength = inLength;

    if (tail == 0)
    {
        // Allow ProcessData to handle all the full blocks
        ProcessData(outString, inString, inLength);
        return inLength;
    }
    else if (blocks > 1)
    {
        // Allow ProcessData to handle full blocks except one. Exactly head
        // bytes are processed here because the pointers advance by head
        // below; the last full block and the tail are handled afterwards.
        const size_t head = (blocks-1)*blockSize;
        ProcessData(outString, inString, head);

        outString += head;
        inString  += head; inLength -= head;
    }

    ///// handle the full block /////

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, inString, m_xregister, blockSize);

    // encrypt one block
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the output block
    XorBuffer(outString, m_xworkspace, m_xregister, blockSize);

    // Multiply T by alpha
    GF_Double(m_xregister, blockSize);

    ///// handle final partial block /////

    inString += blockSize;
    outString += blockSize;
    const size_t len = inLength-blockSize;

    // copy in the final plaintext bytes
    std::memcpy(m_xworkspace, inString, len);
    // and copy out the final ciphertext bytes
    std::memcpy(outString, outString-blockSize, len);
    // "steal" ciphertext to complete the block
    std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);

    // merge the tweak into the input block
    XorBuffer(m_xworkspace, m_xregister, blockSize);

    // encrypt one block
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the tweak into the previous output block
    XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);

    return outLength;
}

// Decrypts the final inLength bytes of a message, undoing ciphertext
// stealing when inLength is not a multiple of the block size. outString
// must have room for at least inLength bytes (asserted in debug builds).
// The caller is expected to supply at least one full block;
// ProcessLastBlock() checks this before dispatching here. Returns the
// number of bytes written, which equals inLength.
size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
{
    // ensure output buffer is large enough
    CRYPTOPP_ASSERT(outLength >= inLength);

    const unsigned int blockSize = GetBlockCipher().BlockSize();
    const size_t blocks = inLength / blockSize;
    const size_t tail = inLength % blockSize;
    outLength = inLength;

    if (tail == 0)
    {
        // Allow ProcessData to handle all the full blocks
        ProcessData(outString, inString, inLength);
        return inLength;
    }
    else if (blocks > 1)
    {
        // Allow ProcessData to handle full blocks except one. Exactly head
        // bytes are processed here because the pointers advance by head
        // below; the last full block and the tail are handled afterwards.
        const size_t head = (blocks-1)*blockSize;
        ProcessData(outString, inString, head);

        outString += head;
        inString  += head; inLength -= head;
    }

    // poly1 is the current tweak and poly2 the one that follows it. The
    // last full input block is processed with poly2 first and the rebuilt
    // block with poly1 afterwards. Local pointers are used instead of
    // macros so the names do not leak into the rest of the file.
    const byte* const poly1 = m_xregister+0*blockSize;
    byte* const poly2 = m_xregister+1*blockSize;
    GF_Double(poly2, poly1, blockSize);

    ///// handle final partial block /////

    inString += blockSize;
    outString += blockSize;
    const size_t len = inLength-blockSize;

    // merge the second tweak into the last full input block
    XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);

    // process one block
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the second tweak into the result
    XorBuffer(m_xworkspace, poly2, blockSize);

    // copy the partial input block to the front of the previous output block
    std::memcpy(outString-blockSize, inString, len);
    // and copy out the final recovered bytes
    std::memcpy(outString, m_xworkspace, len);
    // return the "stolen" bytes to complete the previous block
    std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);

    ///// handle the full previous block /////

    inString -= blockSize;
    outString -= blockSize;

    // merge the first tweak into the rebuilt block
    XorBuffer(m_xworkspace, outString, poly1, blockSize);

    // process one block
    GetBlockCipher().ProcessBlock(m_xworkspace);

    // merge the first tweak into the output block
    XorBuffer(outString, m_xworkspace, poly1, blockSize);

    return outLength;
}

NAMESPACE_END

Coverage Report

Created: 2024-11-21 07:03

Line	Count	Source (jump to first uncovered line)
1		// xts.cpp - written and placed in the public domain by Jeffrey Walton
2
3		// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
4		// base architecture. We can use the SIMD code below without an
5		// architecture option. No runtime tests are required. Unfortunately,
6		// we can't use it on Altivec because an architecture switch is required.
7		// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
8		// 16-byte block sizes.
9
10		#include "pch.h"
11
12		#include "xts.h"
13		#include "misc.h"
14		#include "modes.h"
15		#include "cpu.h"
16
17		#if defined(CRYPTOPP_DEBUG)
18		# include "aes.h"
19		# include "threefish.h"
20		#endif
21
22		// 0.3 to 0.4 cpb profit
23		#if defined(__SSE2__) \|\| defined(_M_X64)
24		# include <emmintrin.h>
25		#endif
26
27		#if defined(__aarch32__) \|\| defined(__aarch64__) \|\| defined(_M_ARM64)
28		# if (CRYPTOPP_ARM_NEON_HEADER) \|\| (CRYPTOPP_ARM_ASIMD_AVAILABLE)
29		# include <arm_neon.h>
30		# endif
31		#endif
32
33		#if defined(__ALTIVEC__)
34		# include "ppc_simd.h"
35		#endif
36
37		ANONYMOUS_NAMESPACE_BEGIN
38
39		using namespace CryptoPP;
40
41		#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
42
43		using CryptoPP::AES;
44		using CryptoPP::XTS_Mode;
45		using CryptoPP::Threefish512;
46
47		void Modes_TestInstantiations()
48		{
49		XTS_Mode<AES>::Encryption m0;
50		XTS_Mode<AES>::Decryption m1;
51		XTS_Mode<AES>::Encryption m2;
52		XTS_Mode<AES>::Decryption m3;
53
54		#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
55		XTS_Mode<Threefish512>::Encryption m4;
56		XTS_Mode<Threefish512>::Decryption m5;
57		#endif
58		}
59		#endif // CRYPTOPP_DEBUG
60
61		inline void XorBuffer(byte output, const byte input, const byte *mask, size_t count)
62	0	{
63	0	CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
64
65		#if defined(CRYPTOPP_DISABLE_ASM)
66		xorbuf(output, input, mask, count);
67
68		#elif defined(__SSE2__) \|\| defined(_M_X64)
69	0	for (size_t i=0; i<count; i+=16)
70	0	_mm_storeu_si128(M128_CAST(output+i),
71	0	_mm_xor_si128(
72	0	_mm_loadu_si128(CONST_M128_CAST(input+i)),
73	0	_mm_loadu_si128(CONST_M128_CAST(mask+i))));
74
75		#elif defined(__aarch32__) \|\| defined(__aarch64__) \|\| defined(_M_ARM64)
76		for (size_t i=0; i<count; i+=16)
77		vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
78
79		#elif defined(__ALTIVEC__)
80		for (size_t i=0; i<count; i+=16)
81		VecStore(VecXor(VecLoad(input+i), VecLoad(mask+i)), output+i);
82
83		#else
84		xorbuf(output, input, mask, count);
85		#endif
86	0	}
87
88		inline void XorBuffer(byte buf, const byte mask, size_t count)
89	0	{
90	0	XorBuffer(buf, buf, mask, count);
91	0	}
92
93		// Borrowed from CMAC, but little-endian representation
94		inline void GF_Double(byte out, const byte in, unsigned int len)
95	0	{
96	0	#if defined(CRYPTOPP_WORD128_AVAILABLE)
97	0	word128 carry = 0, x;
98	0	for (size_t i=0, idx=0; i<len/16; ++i, idx+=16)
99	0	{
100	0	x = GetWord<word128>(false, LITTLE_ENDIAN_ORDER, in+idx);
101	0	word128 y = (x >> 127); x = (x << 1) + carry;
102	0	PutWord<word128>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
103	0	carry = y;
104	0	}
105		#elif defined(_M_X64) \|\| defined(_M_ARM64) \|\| defined(_LP64) \|\| defined(__LP64__)
106		word64 carry = 0, x;
107		for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
108		{
109		x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
110		word64 y = (x >> 63); x = (x << 1) + carry;
111		PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
112		carry = y;
113		}
114		#else
115		word32 carry = 0, x;
116		for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
117		{
118		x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
119		word32 y = (x >> 31); x = (x << 1) + carry;
120		PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
121		carry = y;
122		}
123		#endif
124
125		#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
126
127		CRYPTOPP_ASSERT(IsPowerOf2(len));
128		CRYPTOPP_ASSERT(len >= 16);
129		CRYPTOPP_ASSERT(len <= 128);
130
131		byte* k = out;
132		if (carry)
133		{
134		switch (len)
135		{
136		case 16:
137		{
138		const size_t LEIDX = 16-1;
139		k[LEIDX-15] ^= 0x87;
140		break;
141		}
142		case 32:
143		{
144		// https://crypto.stackexchange.com/q/9815/10496
145		// Polynomial x^256 + x^10 + x^5 + x^2 + 1
146		const size_t LEIDX = 32-1;
147		k[LEIDX-30] ^= 4;
148		k[LEIDX-31] ^= 0x25;
149		break;
150		}
151		case 64:
152		{
153		// https://crypto.stackexchange.com/q/9815/10496
154		// Polynomial x^512 + x^8 + x^5 + x^2 + 1
155		const size_t LEIDX = 64-1;
156		k[LEIDX-62] ^= 1;
157		k[LEIDX-63] ^= 0x25;
158		break;
159		}
160		case 128:
161		{
162		// https://crypto.stackexchange.com/q/9815/10496
163		// Polynomial x^1024 + x^19 + x^6 + x + 1
164		const size_t LEIDX = 128-1;
165		k[LEIDX-125] ^= 8;
166		k[LEIDX-126] ^= 0x00;
167		k[LEIDX-127] ^= 0x43;
168		break;
169		}
170		default:
171		CRYPTOPP_ASSERT(0);
172		}
173		}
174		#else
175	0	CRYPTOPP_ASSERT(len == 16);
176
177	0	byte* k = out;
178	0	if (carry)
179	0	{
180	0	k[0] ^= 0x87;
181	0	return;
182	0	}
183	0	#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
184	0	}
185
186		inline void GF_Double(byte *inout, unsigned int len)
187	0	{
188	0	GF_Double(inout, inout, len);
189	0	}
190
191		ANONYMOUS_NAMESPACE_END
192
193		NAMESPACE_BEGIN(CryptoPP)
194
195		void XTS_ModeBase::ThrowIfInvalidBlockSize(size_t length)
196	0	{
197		#if CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
198		CRYPTOPP_ASSERT(length >= 16 && length <= 128 && IsPowerOf2(length));
199		if (length < 16 \|\| length > 128 \|\| !IsPowerOf2(length))
200		throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not valid");
201		#else
202	0	CRYPTOPP_ASSERT(length == 16);
203	0	if (length != 16)
204	0	throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
205	0	#endif
206	0	}
207
208		void XTS_ModeBase::ThrowIfInvalidKeyLength(size_t length)
209	2	{
210	2	CRYPTOPP_ASSERT(length % 2 == 0);
211	2	if (!GetBlockCipher().IsValidKeyLength((length+1)/2))
212	2	throw InvalidKeyLength(AlgorithmName(), length);
213	2	}
214
215		void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &params)
216	2	{
217	2	ThrowIfInvalidKeyLength(length);
218	2	ThrowIfInvalidBlockSize(BlockSize());
219
220	2	const size_t klen = length/2;
221	2	AccessBlockCipher().SetKey(key+0, klen, params);
222	2	AccessTweakCipher().SetKey(key+klen, klen, params);
223
224	2	ResizeBuffers();
225
226	2	size_t ivLength;
227	2	const byte *iv = GetIVAndThrowIfInvalid(params, ivLength);
228	2	Resynchronize(iv, (int)ivLength);
229	2	}
230
231		void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
232	0	{
233	0	BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
234	0	std::memcpy(m_xregister, m_register, ivLength);
235	0	GetTweakCipher().ProcessBlock(m_xregister);
236	0	}
237
238		void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
239	0	{
240	0	SecByteBlock iv(GetTweakCipher().BlockSize());
241	0	PutWord<word64>(false, order, iv, sector);
242	0	std::memset(iv+8, 0x00, iv.size()-8);
243
244	0	BlockOrientedCipherModeBase::Resynchronize(iv, (int)iv.size());
245	0	std::memcpy(m_xregister, iv, iv.size());
246	0	GetTweakCipher().ProcessBlock(m_xregister);
247	0	}
248
249		void XTS_ModeBase::ResizeBuffers()
250	6	{
251	6	BlockOrientedCipherModeBase::ResizeBuffers();
252	6	m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
253	6	m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
254	6	}
255
256		// ProcessData runs either 12-4-1 blocks, 8-2-1 or 4-1 blocks. Which is
257		// selected depends on ParallelBlocks in the header file. 12-4-1 or 8-2-1
258		// can be used on Aarch64 and PowerPC. Intel should use 4-1 due to lack
259		// of registers. The unneeded code paths should be removed by optimizer.
260		// The extra gyrations save us 1.8 cpb on Aarch64 and 2.1 cpb on PowerPC.
261		void XTS_ModeBase::ProcessData(byte outString, const byte inString, size_t length)
262	0	{
263		// data unit is multiple of 16 bytes
264	0	CRYPTOPP_ASSERT(length % BlockSize() == 0);
265
266	0	enum { lastParallelBlock = ParallelBlocks-1 };
267	0	const unsigned int blockSize = GetBlockCipher().BlockSize();
268	0	const size_t parallelSize = blockSize*ParallelBlocks;
269
270		// encrypt the data unit, optimal size at a time
271	0	while (length >= parallelSize)
272	0	{
273		// m_xregister[0] always points to the next tweak.
274	0	GF_Double(m_xregister+1blockSize, m_xregister+0blockSize, blockSize);
275	0	GF_Double(m_xregister+2blockSize, m_xregister+1blockSize, blockSize);
276	0	GF_Double(m_xregister+3blockSize, m_xregister+2blockSize, blockSize);
277
278	0	if (ParallelBlocks > 4)
279	0	{
280	0	GF_Double(m_xregister+4blockSize, m_xregister+3blockSize, blockSize);
281	0	GF_Double(m_xregister+5blockSize, m_xregister+4blockSize, blockSize);
282	0	GF_Double(m_xregister+6blockSize, m_xregister+5blockSize, blockSize);
283	0	GF_Double(m_xregister+7blockSize, m_xregister+6blockSize, blockSize);
284	0	}
285	0	if (ParallelBlocks > 8)
286	0	{
287	0	GF_Double(m_xregister+8blockSize, m_xregister+7blockSize, blockSize);
288	0	GF_Double(m_xregister+9blockSize, m_xregister+8blockSize, blockSize);
289	0	GF_Double(m_xregister+10blockSize, m_xregister+9blockSize, blockSize);
290	0	GF_Double(m_xregister+11blockSize, m_xregister+10blockSize, blockSize);
291	0	}
292
293		// merge the tweak into the input block
294	0	XorBuffer(m_xworkspace, inString, m_xregister, parallelSize);
295
296		// encrypt one block, merge the tweak into the output block
297	0	GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
298	0	outString, parallelSize, BlockTransformation::BT_AllowParallel);
299
300		// m_xregister[0] always points to the next tweak.
301	0	GF_Double(m_xregister+0, m_xregister+lastParallelBlock*blockSize, blockSize);
302
303	0	inString += parallelSize;
304	0	outString += parallelSize;
305	0	length -= parallelSize;
306	0	}
307
308		// encrypt the data unit, 4 blocks at a time
309	0	while (ParallelBlocks == 12 && length >= blockSize*4)
310	0	{
311		// m_xregister[0] always points to the next tweak.
312	0	GF_Double(m_xregister+1blockSize, m_xregister+0blockSize, blockSize);
313	0	GF_Double(m_xregister+2blockSize, m_xregister+1blockSize, blockSize);
314	0	GF_Double(m_xregister+3blockSize, m_xregister+2blockSize, blockSize);
315
316		// merge the tweak into the input block
317	0	XorBuffer(m_xworkspace, inString, m_xregister, blockSize*4);
318
319		// encrypt one block, merge the tweak into the output block
320	0	GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
321	0	outString, blockSize*4, BlockTransformation::BT_AllowParallel);
322
323		// m_xregister[0] always points to the next tweak.
324	0	GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
325
326	0	inString += blockSize*4;
327	0	outString += blockSize*4;
328	0	length -= blockSize*4;
329	0	}
330
331		// encrypt the data unit, 2 blocks at a time
332	0	while (ParallelBlocks == 8 && length >= blockSize*2)
333	0	{
334		// m_xregister[0] always points to the next tweak.
335	0	GF_Double(m_xregister+1blockSize, m_xregister+0blockSize, blockSize);
336
337		// merge the tweak into the input block
338	0	XorBuffer(m_xworkspace, inString, m_xregister, blockSize*2);
339
340		// encrypt one block, merge the tweak into the output block
341	0	GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister,
342	0	outString, blockSize*2, BlockTransformation::BT_AllowParallel);
343
344		// m_xregister[0] always points to the next tweak.
345	0	GF_Double(m_xregister+0, m_xregister+1*blockSize, blockSize);
346
347	0	inString += blockSize*2;
348	0	outString += blockSize*2;
349	0	length -= blockSize*2;
350	0	}
351
352		// encrypt the data unit, blocksize at a time
353	0	while (length)
354	0	{
355		// merge the tweak into the input block
356	0	XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
357
358		// encrypt one block
359	0	GetBlockCipher().ProcessBlock(m_xworkspace);
360
361		// merge the tweak into the output block
362	0	XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
363
364		// Multiply T by alpha
365	0	GF_Double(m_xregister, blockSize);
366
367	0	inString += blockSize;
368	0	outString += blockSize;
369	0	length -= blockSize;
370	0	}
371	0	}
372
373		size_t XTS_ModeBase::ProcessLastBlock(byte outString, size_t outLength, const byte inString, size_t inLength)
374	0	{
375		// need at least a full AES block
376	0	CRYPTOPP_ASSERT(inLength >= BlockSize());
377
378	0	if (inLength < BlockSize())
379	0	throw InvalidArgument("XTS: message is too short for ciphertext stealing");
380
381	0	if (IsForwardTransformation())
382	0	return ProcessLastPlainBlock(outString, outLength, inString, inLength);
383	0	else
384	0	return ProcessLastCipherBlock(outString, outLength, inString, inLength);
385	0	}
386
387		size_t XTS_ModeBase::ProcessLastPlainBlock(byte outString, size_t outLength, const byte inString, size_t inLength)
388	0	{
389		// ensure output buffer is large enough
390	0	CRYPTOPP_ASSERT(outLength >= inLength);
391
392	0	const unsigned int blockSize = GetBlockCipher().BlockSize();
393	0	const size_t blocks = inLength / blockSize;
394	0	const size_t tail = inLength % blockSize;
395	0	outLength = inLength;
396
397	0	if (tail == 0)
398	0	{
399		// Allow ProcessData to handle all the full blocks
400	0	ProcessData(outString, inString, inLength);
401	0	return inLength;
402	0	}
403	0	else if (blocks > 1)
404	0	{
405		// Allow ProcessData to handle full blocks except one
406	0	const size_t head = (blocks-1)*blockSize;
407	0	ProcessData(outString, inString, inLength-head);
408
409	0	outString += head;
410	0	inString += head; inLength -= head;
411	0	}
412
413		///// handle the full block /////
414
415		// merge the tweak into the input block
416	0	XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
417
418		// encrypt one block
419	0	GetBlockCipher().ProcessBlock(m_xworkspace);
420
421		// merge the tweak into the output block
422	0	XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
423
424		// Multiply T by alpha
425	0	GF_Double(m_xregister, blockSize);
426
427		///// handle final partial block /////
428
429	0	inString += blockSize;
430	0	outString += blockSize;
431	0	const size_t len = inLength-blockSize;
432
433		// copy in the final plaintext bytes
434	0	std::memcpy(m_xworkspace, inString, len);
435		// and copy out the final ciphertext bytes
436	0	std::memcpy(outString, outString-blockSize, len);
437		// "steal" ciphertext to complete the block
438	0	std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
439
440		// merge the tweak into the input block
441	0	XorBuffer(m_xworkspace, m_xregister, blockSize);
442
443		// encrypt one block
444	0	GetBlockCipher().ProcessBlock(m_xworkspace);
445
446		// merge the tweak into the previous output block
447	0	XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
448
449	0	return outLength;
450	0	}
451
452		size_t XTS_ModeBase::ProcessLastCipherBlock(byte outString, size_t outLength, const byte inString, size_t inLength)
453	0	{
454		// ensure output buffer is large enough
455	0	CRYPTOPP_ASSERT(outLength >= inLength);
456
457	0	const unsigned int blockSize = GetBlockCipher().BlockSize();
458	0	const size_t blocks = inLength / blockSize;
459	0	const size_t tail = inLength % blockSize;
460	0	outLength = inLength;
461
462	0	if (tail == 0)
463	0	{
464		// Allow ProcessData to handle all the full blocks
465	0	ProcessData(outString, inString, inLength);
466	0	return inLength;
467	0	}
468	0	else if (blocks > 1)
469	0	{
470		// Allow ProcessData to handle full blocks except one
471	0	const size_t head = (blocks-1)*blockSize;
472	0	ProcessData(outString, inString, inLength-head);
473
474	0	outString += head;
475	0	inString += head; inLength -= head;
476	0	}
477
478	0	#define poly1 (m_xregister+0*blockSize)
479	0	#define poly2 (m_xregister+1*blockSize)
480	0	GF_Double(poly2, poly1, blockSize);
481
482		///// handle final partial block /////
483
484	0	inString += blockSize;
485	0	outString += blockSize;
486	0	const size_t len = inLength-blockSize;
487
488		// merge the tweak into the input block
489	0	XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
490
491		// encrypt one block
492	0	GetBlockCipher().ProcessBlock(m_xworkspace);
493
494		// merge the tweak into the output block
495	0	XorBuffer(m_xworkspace, poly2, blockSize);
496
497		// copy in the final plaintext bytes
498	0	std::memcpy(outString-blockSize, inString, len);
499		// and copy out the final ciphertext bytes
500	0	std::memcpy(outString, m_xworkspace, len);
501		// "steal" ciphertext to complete the block
502	0	std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
503
504		///// handle the full previous block /////
505
506	0	inString -= blockSize;
507	0	outString -= blockSize;
508
509		// merge the tweak into the input block
510	0	XorBuffer(m_xworkspace, outString, poly1, blockSize);
511
512		// encrypt one block
513	0	GetBlockCipher().ProcessBlock(m_xworkspace);
514
515		// merge the tweak into the output block
516	0	XorBuffer(outString, m_xworkspace, poly1, blockSize);
517
518	0	return outLength;
519	0	}
520
521		NAMESPACE_END