/src/botan/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * SHACAL-2 using x86 SHA extensions |
3 | | * (C) 2017 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/internal/shacal2.h> |
9 | | #include <immintrin.h> |
10 | | |
11 | | namespace Botan { |
12 | | |
13 | | /* |
14 | | Only encryption is supported since the inverse round function would |
15 | | require a different instruction |
16 | | */ |
17 | | |
/*
* Encrypt 'blocks' 16-byte SHACAL-2 blocks from in[] to out[] using the
* x86 SHA extensions. Each _mm_sha256rnds2_epu32 performs two SHA-256
* rounds (which is exactly SHACAL-2's encryption round function), so the
* 4 calls per loop iteration over 8 iterations cover all 8*4*2 = 64
* rounds. Only unaligned loads/stores are used, so in/out need no
* particular alignment. Only encryption is implemented (see file comment).
*/
BOTAN_FUNC_ISA("sha,ssse3")
void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   // Byte shuffle patterns (for _mm_shuffle_epi8), used together with the
   // 64-bit unpack ops below to permute between the serialized block's
   // byte/word order and the two-register state layout that
   // _mm_sha256rnds2_epu32 operates on. MASK2 is applied on input,
   // MASK1 on output.
   const __m128i MASK1 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
   const __m128i MASK2 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);

   // m_RK holds the 32-bit round keys (key schedule set up elsewhere);
   // consumed four at a time below.
   const __m128i* RK_mm = reinterpret_cast<const __m128i*>(m_RK.data());
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   // Main loop: two blocks at a time. The B0_* and B1_* round chains are
   // independent, so interleaving them lets the CPU overlap the latency
   // of consecutive sha256rnds2 instructions.
   while(blocks >= 2)
      {
      // Load both 16-byte halves of each of the two blocks
      __m128i B0_0 = _mm_loadu_si128(in_mm);
      __m128i B0_1 = _mm_loadu_si128(in_mm+1);
      __m128i B1_0 = _mm_loadu_si128(in_mm+2);
      __m128i B1_1 = _mm_loadu_si128(in_mm+3);

      // Permute each block's 8 words into the state layout expected by
      // the SHA round instruction
      __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2);
      B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2);
      B0_0 = TMP;

      TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK2);
      B1_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK2);
      B1_0 = TMP;

      // 8 iterations x 4 sha256rnds2 x 2 rounds each = 64 rounds
      for(size_t i = 0; i != 8; ++i)
         {
         // Load 8 round keys. sha256rnds2 reads its two keys from the
         // low 64 bits of the third operand, so the upper key pair of
         // each load is shifted down to form RK1/RK3.
         const __m128i RK0 = _mm_loadu_si128(RK_mm + 2*i);
         const __m128i RK2 = _mm_loadu_si128(RK_mm + 2*i+1);
         const __m128i RK1 = _mm_srli_si128(RK0, 8);
         const __m128i RK3 = _mm_srli_si128(RK2, 8);

         // Each pair of rounds alternates which half of the state is
         // updated; the two blocks' chains are interleaved for ILP.
         B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK0);
         B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK0);

         B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK1);
         B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK1);

         B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK2);
         B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK2);

         B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK3);
         B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3);
         }

      // Undo the state-layout permutation and store both ciphertext blocks
      _mm_storeu_si128(out_mm + 0, _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK1));
      _mm_storeu_si128(out_mm + 1, _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK1));
      _mm_storeu_si128(out_mm + 2, _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK1));
      _mm_storeu_si128(out_mm + 3, _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK1));

      blocks -= 2;
      in_mm += 4;
      out_mm += 4;
      }

   // Tail: any remaining single block, same flow as above but with one
   // round chain instead of two.
   while(blocks)
      {
      __m128i B0 = _mm_loadu_si128(in_mm);
      __m128i B1 = _mm_loadu_si128(in_mm+1);

      // Permute into the sha256rnds2 state layout
      __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
      B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
      B0 = TMP;

      // 8 iterations x 4 sha256rnds2 x 2 rounds each = 64 rounds
      for(size_t i = 0; i != 8; ++i)
         {
         // Same key-extraction scheme as the two-block loop
         const __m128i RK0 = _mm_loadu_si128(RK_mm + 2*i);
         const __m128i RK2 = _mm_loadu_si128(RK_mm + 2*i+1);
         const __m128i RK1 = _mm_srli_si128(RK0, 8);
         const __m128i RK3 = _mm_srli_si128(RK2, 8);

         B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
         B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
         B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
         B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
         }

      // Undo the permutation and store the ciphertext block
      _mm_storeu_si128(out_mm    , _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1));
      _mm_storeu_si128(out_mm + 1, _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1));

      blocks--;
      in_mm += 2;
      out_mm += 2;
      }
   }
103 | | |
104 | | } |