Coverage Report

Created: 2023-06-07 07:00

/src/botan/src/lib/utils/ghash/ghash_cpu/ghash_cpu.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Hook for CLMUL/PMULL/VPMSUM
3
* (C) 2013,2017,2019,2020 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/internal/ghash.h>
9
10
#include <botan/internal/simd_32.h>
11
12
#if defined(BOTAN_SIMD_USE_SSE2)
13
   #include <immintrin.h>
14
   #include <wmmintrin.h>
15
#endif
16
17
namespace Botan {
18
19
namespace {
20
21
0
// Reverse all 16 bytes of the vector (byte 0 <-> byte 15, etc).
//
// Used to convert 16-byte GHASH blocks between their in-memory byte
// order and the ordering the carryless-multiply routines operate on.
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in) {
#if defined(BOTAN_SIMD_USE_SSE2)
   // PSHUFB with descending byte indices performs the full 16-byte reversal
   const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
#elif defined(BOTAN_SIMD_USE_NEON)
   // TBL lookup with a descending index table does the same reversal
   const uint8_t maskb[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
   const uint8x16_t mask = vld1q_u8(maskb);
   return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   // vperm with a descending permute vector
   const __vector unsigned char mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
   return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask));
#endif
}
34
35
// Carryless (polynomial) multiply of one 64-bit half of x with one
// 64-bit half of H, returning the 128-bit product.
//
// M follows the PCLMULQDQ immediate convention: the low nibble selects
// the half of x, the high nibble the half of H (so 0x00, 0x01, 0x10,
// 0x11 are the four half-by-half products).
template <int M>
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x) {
   static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");

#if defined(BOTAN_SIMD_USE_SSE2)
   // Direct hardware support; the immediate selects the doublewords
   return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
#elif defined(BOTAN_SIMD_USE_NEON)
   // Extract the selected 64-bit lanes, then polynomial multiply
   const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
   const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
   return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
#elif defined(BOTAN_SIMD_USE_ALTIVEC)
   const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);

   SIMD_4x32 i1 = x;
   SIMD_4x32 i2 = H;

   // vpmsumd XORs the products of both 64-bit lanes of its inputs, so
   // mask or shift the operands such that the unwanted lane product is
   // zero and only the product selected by M survives.
   if(M == 0x11) {
      i1 &= mask_lo;
      i2 &= mask_lo;
   } else if(M == 0x10) {
      i1 = i1.shift_elems_left<2>();
   } else if(M == 0x01) {
      i2 = i2.shift_elems_left<2>();
   } else if(M == 0x00) {
      i1 = mask_lo.andc(i1);
      i2 = mask_lo.andc(i2);
   }

   auto i1v = reinterpret_cast<__vector unsigned long long>(i1.raw());
   auto i2v = reinterpret_cast<__vector unsigned long long>(i2.raw());

   #if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vpmsumd)
   auto rv = __builtin_crypto_vpmsumd(i1v, i2v);
   #else
   // Some compilers only expose the instruction under this older name
   auto rv = __builtin_altivec_crypto_vpmsumd(i1v, i2v);
   #endif

   return SIMD_4x32(reinterpret_cast<__vector unsigned int>(rv));
#endif
}
Unexecuted instantiation: ghash_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<17>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Unexecuted instantiation: ghash_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<16>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Unexecuted instantiation: ghash_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<1>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
Unexecuted instantiation: ghash_cpu.cpp:Botan::SIMD_4x32 Botan::(anonymous namespace)::clmul<0>(Botan::SIMD_4x32 const&, Botan::SIMD_4x32 const&)
75
76
0
// Fold a 256-bit carryless product down to 128 bits modulo the GHASH
// polynomial. B0 holds the high 128 bits and B1 the low 128 bits of
// the product, as produced by gcm_multiply / gcm_multiply_x4.
inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1) {
   // Shift the whole 256-bit value left by one bit; the shr<31> values
   // carry the top bit of each 32-bit element across element boundaries
   // (presumably compensating for the bit-reflected operand encoding —
   // see Intel's CLMUL guide)
   const SIMD_4x32 carry_lo = B1.shr<31>();
   const SIMD_4x32 carry_hi = B0.shr<31>();
   SIMD_4x32 lo = B1.shl<1>();
   SIMD_4x32 hi = B0.shl<1>();

   hi |= carry_lo.shift_elems_right<3>();
   hi |= carry_hi.shift_elems_left<1>();
   lo |= carry_lo.shift_elems_left<1>();

   // Partial fold of the low half via the reduction polynomial terms
   const SIMD_4x32 poly = lo.shl<31>() ^ lo.shl<30>() ^ lo.shl<25>();

   lo ^= poly.shift_elems_left<3>();

   // Combine with the high half and complete the reduction
   SIMD_4x32 result = lo ^ hi ^ poly.shift_elems_right<1>();
   result ^= lo.shr<7>() ^ lo.shr<2>() ^ lo.shr<1>();
   return result;
}
94
95
0
// One GF(2^128) multiplication for GHASH: computes H * x and reduces.
//
// The 128x128 -> 256 bit carryless product is built from the four
// 64x64 half-products, then folded by gcm_reduce.
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x) {
   const SIMD_4x32 hi = clmul<0x11>(H, x);
   const SIMD_4x32 lo = clmul<0x00>(H, x);

   // The two cross products form the middle 128 bits, split between
   // the high and low halves of the 256-bit result
   const SIMD_4x32 mid = clmul<0x10>(H, x) ^ clmul<0x01>(H, x);

   return gcm_reduce(hi ^ mid.shift_elems_right<2>(), lo ^ mid.shift_elems_left<2>());
}
107
108
// Computes (H1*X1) ^ (H2*X2) ^ (H3*X3) ^ (H4*X4) in GF(2^128),
// performing the expensive modular reduction only once for all four
// products rather than once per block.
inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply_x4(const SIMD_4x32& H1,
                                                                 const SIMD_4x32& H2,
                                                                 const SIMD_4x32& H3,
                                                                 const SIMD_4x32& H4,
                                                                 const SIMD_4x32& X1,
                                                                 const SIMD_4x32& X2,
                                                                 const SIMD_4x32& X3,
                                                                 const SIMD_4x32& X4) {
   /*
   * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
   * and Pierre Laurent of Intel
   */

   // XOR-sum of the four low-half products, and of the four high-half products
   const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^ (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));

   const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^ (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));

   // Karatsuba-style middle terms: (Ha ^ Hb) * (Xa ^ Xb) per block.
   // NOTE(review): relies on SIMD_4x32's default constructor producing
   // an all-zero vector — confirm against simd_32.h
   SIMD_4x32 T;

   T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
   T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
   T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
   T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
   // Cancel the lo/hi contributions, leaving only the summed cross products
   T ^= lo;
   T ^= hi;

   // Assemble the 256-bit sum (T spans the middle 128 bits) and reduce once
   return gcm_reduce(hi ^ T.shift_elems_right<2>(), lo ^ T.shift_elems_left<2>());
}
136
137
}  // namespace
138
139
0
// Precompute the key-dependent tables for hardware-accelerated GHASH.
//
// Loads the 16-byte hash key H, then stores H, H^2, H^3 and H^4 into
// H_pow (two 64-bit words per power), for use by ghash_multiply_cpu's
// 4-way aggregated multiplication.
BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) void GHASH::ghash_precompute_cpu(const uint8_t H_bytes[16], uint64_t H_pow[4 * 2]) {
   const SIMD_4x32 H = reverse_vector(SIMD_4x32::load_le(H_bytes));
   const SIMD_4x32 H_sq = gcm_multiply(H, H);
   const SIMD_4x32 H_cube = gcm_multiply(H, H_sq);
   const SIMD_4x32 H_quad = gcm_multiply(H_sq, H_sq);

   const SIMD_4x32 pows[4] = {H, H_sq, H_cube, H_quad};

   for(size_t i = 0; i != 4; ++i) {
      pows[i].store_le(H_pow + 2 * i);
   }
}
150
151
// Absorb `blocks` 16-byte blocks of `input` into the GHASH state `x`,
// using the precomputed key powers in H_pow (see ghash_precompute_cpu).
//
// Processes four blocks at a time with a single delayed reduction while
// possible, then handles any remainder one block at a time.
BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
void GHASH::ghash_multiply_cpu(uint8_t x[16], const uint64_t H_pow[8], const uint8_t input[], size_t blocks) {
   /*
   * Algorithms 1 and 5 from Intel's CLMUL guide
   */
   const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);

   SIMD_4x32 acc = reverse_vector(SIMD_4x32::load_le(x));

   if(blocks >= 4) {
      // Higher powers of H for the 4-way aggregated multiply
      const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
      const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
      const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);

      while(blocks >= 4) {
         const SIMD_4x32 b0 = reverse_vector(SIMD_4x32::load_le(input + 16 * 0));
         const SIMD_4x32 b1 = reverse_vector(SIMD_4x32::load_le(input + 16 * 1));
         const SIMD_4x32 b2 = reverse_vector(SIMD_4x32::load_le(input + 16 * 2));
         const SIMD_4x32 b3 = reverse_vector(SIMD_4x32::load_le(input + 16 * 3));

         // The first block joins the running state, so it (and the
         // state) are multiplied by the highest power H^4
         acc = gcm_multiply_x4(H1, H2, H3, H4, b3, b2, b1, acc ^ b0);

         input += 4 * 16;
         blocks -= 4;
      }
   }

   // Remaining blocks, one GF(2^128) multiply each
   while(blocks > 0) {
      acc = gcm_multiply(H1, acc ^ reverse_vector(SIMD_4x32::load_le(input)));
      input += 16;
      blocks -= 1;
   }

   reverse_vector(acc).store_le(x);
}
189
190
}  // namespace Botan