/src/botan/src/lib/block/shacal2/shacal2_x86/shacal2_x86.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * SHACAL-2 using x86 SHA extensions |
3 | | * (C) 2017 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/shacal2.h> |
9 | | #include <immintrin.h> |
10 | | |
11 | | namespace Botan { |
12 | | |
13 | | /* |
14 | | Only encryption is supported since the inverse round function would |
15 | | require a different instruction |
16 | | */ |
17 | | |
// Encrypts `blocks` consecutive blocks from `in` into `out` using the x86
// SHA-256 round intrinsic (_mm_sha256rnds2_epu32).
//
// Each block occupies two 128-bit vectors (32 bytes) in both buffers; all
// loads and stores use the unaligned variants. Round keys are read from m_RK
// as 16 vectors of 128 bits, and each vector feeds two intrinsic calls (its
// low 64 bits first, then its high 64 bits). Blocks are processed in pairs
// while at least two remain, then singly.
//
// NOTE(review): BOTAN_FUNC_ISA is defined elsewhere; presumably it enables the
// "sha" and "ssse3" instruction sets for this function only - confirm.
// NOTE(review): assumes m_RK provides at least 256 bytes of round keys -
// confirm against the key schedule.
18 | | BOTAN_FUNC_ISA("sha,ssse3") |
19 | | void SHACAL2::x86_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const |
20 | 0 | { |
// Shuffle masks (_mm_set_epi8 lists bytes from most to least significant):
// MASK1 reverses the bytes inside each 64-bit half, MASK2 reverses all 16 bytes.
21 | 0 | const __m128i MASK1 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); |
22 | 0 | const __m128i MASK2 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); |
23 | 0 |
|
// View the round keys and both buffers as arrays of 128-bit vectors.
24 | 0 | const __m128i* RK_mm = reinterpret_cast<const __m128i*>(m_RK.data()); |
25 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
26 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
27 | 0 |
|
// Main loop: two blocks per iteration, held in separate registers
// (B0_* for the first block, B1_* for the second).
28 | 0 | while(blocks >= 2) |
29 | 0 | { |
30 | 0 | __m128i B0_0 = _mm_loadu_si128(in_mm); |
31 | 0 | __m128i B0_1 = _mm_loadu_si128(in_mm+1); |
32 | 0 | __m128i B1_0 = _mm_loadu_si128(in_mm+2); |
33 | 0 | __m128i B1_1 = _mm_loadu_si128(in_mm+3); |
34 | 0 |
|
// Regroup and byte-reverse the input: *_0 receives the low 64 bits of both
// loaded vectors, *_1 the high 64 bits. Viewing a block as eight big-endian
// 32-bit words, *_0 ends up with words 1,2,5,6 and *_1 with words 3,4,7,8.
35 | 0 | __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK2); |
36 | 0 | B0_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK2); |
37 | 0 | B0_0 = TMP; |
38 | 0 |
|
39 | 0 | TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK2); |
40 | 0 | B1_1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK2); |
41 | 0 | B1_0 = TMP; |
42 | 0 |
|
// 8 iterations x 4 intrinsic calls x 2 rounds per call = 64 rounds per block.
43 | 0 | for(size_t i = 0; i != 8; ++i) |
44 | 0 | { |
// RK1/RK3 shift the high 64 bits of RK0/RK2 down into the low half, which is
// the part of the key operand the round intrinsic consumes.
45 | 0 | const __m128i RK0 = _mm_loadu_si128(RK_mm + 2*i); |
46 | 0 | const __m128i RK2 = _mm_loadu_si128(RK_mm + 2*i+1); |
47 | 0 | const __m128i RK1 = _mm_srli_si128(RK0, 8); |
48 | 0 | const __m128i RK3 = _mm_srli_si128(RK2, 8); |
49 | 0 |
|
// Each call's result overwrites its first operand, so the two state registers
// of a block swap roles from one call to the next. The same key is applied to
// both blocks before moving on to the next key.
50 | 0 | B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK0); |
51 | 0 | B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK0); |
52 | 0 |
|
53 | 0 | B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK1); |
54 | 0 | B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK1); |
55 | 0 |
|
56 | 0 | B0_1 = _mm_sha256rnds2_epu32(B0_1, B0_0, RK2); |
57 | 0 | B1_1 = _mm_sha256rnds2_epu32(B1_1, B1_0, RK2); |
58 | 0 |
|
59 | 0 | B0_0 = _mm_sha256rnds2_epu32(B0_0, B0_1, RK3); |
60 | 0 | B1_0 = _mm_sha256rnds2_epu32(B1_0, B1_1, RK3); |
61 | 0 | } |
62 | 0 |
|
// Undo the input regrouping: the high halves of both state registers form the
// first output vector, the low halves the second, with bytes reversed inside
// each 64-bit half (MASK1).
63 | 0 | TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0_0, B0_1), MASK1); |
64 | 0 | B0_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0_0, B0_1), MASK1); |
65 | 0 | B0_0 = TMP; |
66 | 0 |
|
67 | 0 | TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B1_0, B1_1), MASK1); |
68 | 0 | B1_1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B1_0, B1_1), MASK1); |
69 | 0 | B1_0 = TMP; |
70 | 0 |
|
71 | 0 | // Save state |
72 | 0 | _mm_storeu_si128(out_mm + 0, B0_0); |
73 | 0 | _mm_storeu_si128(out_mm + 1, B0_1); |
74 | 0 | _mm_storeu_si128(out_mm + 2, B1_0); |
75 | 0 | _mm_storeu_si128(out_mm + 3, B1_1); |
76 | 0 |
|
// Advance past the two blocks just processed (4 vectors = 64 bytes).
77 | 0 | blocks -= 2; |
78 | 0 | in_mm += 4; |
79 | 0 | out_mm += 4; |
80 | 0 | } |
81 | 0 |
|
// Tail: at most one block remains here; it goes through the same steps as
// above using a single pair of state registers.
82 | 0 | while(blocks) |
83 | 0 | { |
84 | 0 | __m128i B0 = _mm_loadu_si128(in_mm); |
85 | 0 | __m128i B1 = _mm_loadu_si128(in_mm+1); |
86 | 0 |
|
// Same input regrouping and byte reversal as in the two-block loop.
87 | 0 | __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2); |
88 | 0 | B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2); |
89 | 0 | B0 = TMP; |
90 | 0 |
|
91 | 0 | for(size_t i = 0; i != 8; ++i) |
92 | 0 | { |
93 | 0 | const __m128i RK0 = _mm_loadu_si128(RK_mm + 2*i); |
94 | 0 | const __m128i RK2 = _mm_loadu_si128(RK_mm + 2*i+1); |
95 | 0 | const __m128i RK1 = _mm_srli_si128(RK0, 8); |
96 | 0 | const __m128i RK3 = _mm_srli_si128(RK2, 8); |
97 | 0 |
|
98 | 0 | B1 = _mm_sha256rnds2_epu32(B1, B0, RK0); |
99 | 0 | B0 = _mm_sha256rnds2_epu32(B0, B1, RK1); |
100 | 0 | B1 = _mm_sha256rnds2_epu32(B1, B0, RK2); |
101 | 0 | B0 = _mm_sha256rnds2_epu32(B0, B1, RK3); |
102 | 0 | } |
103 | 0 |
|
// Same output regrouping as in the two-block loop.
104 | 0 | TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1); |
105 | 0 | B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1); |
106 | 0 | B0 = TMP; |
107 | 0 |
|
108 | 0 | // Save state |
109 | 0 | _mm_storeu_si128(out_mm, B0); |
110 | 0 | _mm_storeu_si128(out_mm + 1, B1); |
111 | 0 |
|
112 | 0 | blocks--; |
113 | 0 | in_mm += 2; |
114 | 0 | out_mm += 2; |
115 | 0 | } |
116 | 0 | } |
117 | | |
118 | | } |