/src/botan/src/lib/block/idea/idea_sse2/idea_sse2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * IDEA in SSE2 |
3 | | * (C) 2009 Jack Lloyd |
4 | | * |
5 | | * Botan is released under the Simplified BSD License (see license.txt) |
6 | | */ |
7 | | |
8 | | #include <botan/idea.h> |
9 | | #include <botan/internal/ct_utils.h> |
10 | | #include <emmintrin.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | namespace { |
15 | | |
16 | | BOTAN_FUNC_ISA("sse2") |
17 | | inline __m128i mul(__m128i X, uint16_t K_16) |
18 | 0 | { |
19 | 0 | const __m128i zeros = _mm_set1_epi16(0); |
20 | 0 | const __m128i ones = _mm_set1_epi16(1); |
21 | 0 |
|
22 | 0 | const __m128i K = _mm_set1_epi16(K_16); |
23 | 0 |
|
24 | 0 | const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros); |
25 | 0 | const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros); |
26 | 0 |
|
27 | 0 | const __m128i mul_lo = _mm_mullo_epi16(X, K); |
28 | 0 | const __m128i mul_hi = _mm_mulhi_epu16(X, K); |
29 | 0 |
|
30 | 0 | __m128i T = _mm_sub_epi16(mul_lo, mul_hi); |
31 | 0 |
|
32 | | // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0 |
33 | 0 | const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo); |
34 | 0 | const __m128i cmp = _mm_min_epu8( |
35 | 0 | _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones); |
36 | 0 |
|
37 | 0 | T = _mm_add_epi16(T, cmp); |
38 | 0 |
|
39 | | /* Selection: if X[i] is zero then assign 1-K |
40 | | if K is zero then assign 1-X[i] |
41 | | |
42 | | Could if() off value of K_16 for the second, but this gives a |
43 | | constant time implementation which is a nice bonus. |
44 | | */ |
45 | 0 |
|
46 | 0 | T = _mm_or_si128( |
47 | 0 | _mm_andnot_si128(X_is_zero, T), |
48 | 0 | _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero)); |
49 | 0 |
|
50 | 0 | T = _mm_or_si128( |
51 | 0 | _mm_andnot_si128(K_is_zero, T), |
52 | 0 | _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero)); |
53 | 0 |
|
54 | 0 | return T; |
55 | 0 | } |
56 | | |
57 | | /* |
58 | | * 4x8 matrix transpose |
59 | | * |
60 | | * FIXME: why do I need the extra set of unpack_epi32 here? Inverse in |
61 | | * transpose_out doesn't need it. Something with the shuffle? Removing |
62 | | * that extra unpack could easily save 3-4 cycles per block, and would |
63 | | * also help a lot with register pressure on 32-bit x86 |
64 | | */ |
65 | | BOTAN_FUNC_ISA("sse2") |
66 | | void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) |
67 | 0 | { |
68 | 0 | __m128i T0 = _mm_unpackhi_epi32(B0, B1); |
69 | 0 | __m128i T1 = _mm_unpacklo_epi32(B0, B1); |
70 | 0 | __m128i T2 = _mm_unpackhi_epi32(B2, B3); |
71 | 0 | __m128i T3 = _mm_unpacklo_epi32(B2, B3); |
72 | 0 |
|
73 | 0 | __m128i T4 = _mm_unpacklo_epi32(T0, T1); |
74 | 0 | __m128i T5 = _mm_unpackhi_epi32(T0, T1); |
75 | 0 | __m128i T6 = _mm_unpacklo_epi32(T2, T3); |
76 | 0 | __m128i T7 = _mm_unpackhi_epi32(T2, T3); |
77 | 0 |
|
78 | 0 | T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2)); |
79 | 0 | T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2)); |
80 | 0 | T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2)); |
81 | 0 | T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2)); |
82 | 0 |
|
83 | 0 | T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2)); |
84 | 0 | T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2)); |
85 | 0 | T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2)); |
86 | 0 | T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2)); |
87 | 0 |
|
88 | 0 | T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); |
89 | 0 | T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); |
90 | 0 | T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); |
91 | 0 | T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); |
92 | 0 |
|
93 | 0 | B0 = _mm_unpacklo_epi64(T0, T2); |
94 | 0 | B1 = _mm_unpackhi_epi64(T0, T2); |
95 | 0 | B2 = _mm_unpacklo_epi64(T1, T3); |
96 | 0 | B3 = _mm_unpackhi_epi64(T1, T3); |
97 | 0 | } |
98 | | |
99 | | /* |
100 | | * 4x8 matrix transpose (reverse) |
101 | | */ |
102 | | BOTAN_FUNC_ISA("sse2") |
103 | | void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) |
104 | 0 | { |
105 | 0 | __m128i T0 = _mm_unpacklo_epi64(B0, B1); |
106 | 0 | __m128i T1 = _mm_unpacklo_epi64(B2, B3); |
107 | 0 | __m128i T2 = _mm_unpackhi_epi64(B0, B1); |
108 | 0 | __m128i T3 = _mm_unpackhi_epi64(B2, B3); |
109 | 0 |
|
110 | 0 | T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); |
111 | 0 | T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); |
112 | 0 | T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); |
113 | 0 | T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); |
114 | 0 |
|
115 | 0 | T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); |
116 | 0 | T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); |
117 | 0 | T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); |
118 | 0 | T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); |
119 | 0 |
|
120 | 0 | T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); |
121 | 0 | T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); |
122 | 0 | T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); |
123 | 0 | T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); |
124 | 0 |
|
125 | 0 | B0 = _mm_unpacklo_epi32(T0, T1); |
126 | 0 | B1 = _mm_unpackhi_epi32(T0, T1); |
127 | 0 | B2 = _mm_unpacklo_epi32(T2, T3); |
128 | 0 | B3 = _mm_unpackhi_epi32(T2, T3); |
129 | 0 | } |
130 | | |
131 | | } |
132 | | |
133 | | /* |
134 | | * 8 wide IDEA encryption/decryption in SSE2 |
135 | | */ |
136 | | BOTAN_FUNC_ISA("sse2") |
137 | | void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const |
138 | 0 | { |
139 | 0 | CT::poison(in, 64); |
140 | 0 | CT::poison(out, 64); |
141 | 0 | CT::poison(EK, 52); |
142 | 0 |
|
143 | 0 | const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); |
144 | 0 |
|
145 | 0 | __m128i B0 = _mm_loadu_si128(in_mm + 0); |
146 | 0 | __m128i B1 = _mm_loadu_si128(in_mm + 1); |
147 | 0 | __m128i B2 = _mm_loadu_si128(in_mm + 2); |
148 | 0 | __m128i B3 = _mm_loadu_si128(in_mm + 3); |
149 | 0 |
|
150 | 0 | transpose_in(B0, B1, B2, B3); |
151 | 0 |
|
152 | | // byte swap |
153 | 0 | B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); |
154 | 0 | B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8)); |
155 | 0 | B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); |
156 | 0 | B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); |
157 | 0 |
|
158 | 0 | for(size_t i = 0; i != 8; ++i) |
159 | 0 | { |
160 | 0 | B0 = mul(B0, EK[6*i+0]); |
161 | 0 | B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1])); |
162 | 0 | B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2])); |
163 | 0 | B3 = mul(B3, EK[6*i+3]); |
164 | 0 |
|
165 | 0 | __m128i T0 = B2; |
166 | 0 | B2 = _mm_xor_si128(B2, B0); |
167 | 0 | B2 = mul(B2, EK[6*i+4]); |
168 | 0 |
|
169 | 0 | __m128i T1 = B1; |
170 | 0 |
|
171 | 0 | B1 = _mm_xor_si128(B1, B3); |
172 | 0 | B1 = _mm_add_epi16(B1, B2); |
173 | 0 | B1 = mul(B1, EK[6*i+5]); |
174 | 0 |
|
175 | 0 | B2 = _mm_add_epi16(B2, B1); |
176 | 0 |
|
177 | 0 | B0 = _mm_xor_si128(B0, B1); |
178 | 0 | B1 = _mm_xor_si128(B1, T0); |
179 | 0 | B3 = _mm_xor_si128(B3, B2); |
180 | 0 | B2 = _mm_xor_si128(B2, T1); |
181 | 0 | } |
182 | 0 |
|
183 | 0 | B0 = mul(B0, EK[48]); |
184 | 0 | B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50])); |
185 | 0 | B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49])); |
186 | 0 | B3 = mul(B3, EK[51]); |
187 | 0 |
|
188 | | // byte swap |
189 | 0 | B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); |
190 | 0 | B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8)); |
191 | 0 | B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); |
192 | 0 | B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); |
193 | 0 |
|
194 | 0 | transpose_out(B0, B2, B1, B3); |
195 | 0 |
|
196 | 0 | __m128i* out_mm = reinterpret_cast<__m128i*>(out); |
197 | 0 |
|
198 | 0 | _mm_storeu_si128(out_mm + 0, B0); |
199 | 0 | _mm_storeu_si128(out_mm + 1, B2); |
200 | 0 | _mm_storeu_si128(out_mm + 2, B1); |
201 | 0 | _mm_storeu_si128(out_mm + 3, B3); |
202 | 0 |
|
203 | 0 | CT::unpoison(in, 64); |
204 | 0 | CT::unpoison(out, 64); |
205 | 0 | CT::unpoison(EK, 52); |
206 | 0 | } |
207 | | |
208 | | } |