Coverage Report

Created: 2020-09-16 07:52

/src/botan/src/lib/block/idea/idea_sse2/idea_sse2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* IDEA in SSE2
3
* (C) 2009 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7
8
#include <botan/idea.h>
9
#include <botan/internal/ct_utils.h>
10
#include <emmintrin.h>
11
12
namespace Botan {
13
14
namespace {
15
16
/*
* Multiply each of the eight 16-bit lanes of X by the scalar K_16.
*
* For non-zero operands the result is lo - hi, plus one when lo < hi,
* where lo/hi are the low/high 16-bit halves of the unsigned product;
* this is the usual reduction of a 32-bit product modulo 2^16+1.
* A lane where X is zero yields 1-K, and a zero K yields 1-X in every
* lane (presumably IDEA's convention that 0 stands for 2^16).
*
* Both special cases are resolved with masks rather than branches. An
* if() on K_16 would work for the second one, but the masked form gives
* a constant time implementation.
*/
BOTAN_FUNC_ISA("sse2")
inline __m128i mul(__m128i X, uint16_t K_16)
   {
   const __m128i all_zero = _mm_set1_epi16(0);
   const __m128i all_one = _mm_set1_epi16(1);
   const __m128i key = _mm_set1_epi16(K_16);

   // All-ones in every lane whose operand is zero
   const __m128i x_zero_mask = _mm_cmpeq_epi16(X, all_zero);
   const __m128i key_zero_mask = _mm_cmpeq_epi16(key, all_zero);

   // Low and high halves of the unsigned 16x16 -> 32 bit products
   const __m128i prod_lo = _mm_mullo_epi16(X, key);
   const __m128i prod_hi = _mm_mulhi_epu16(X, key);

   // Unsigned compare: borrow = 1 if prod_lo < prod_hi else 0.
   // The saturating difference is non-zero exactly in that case; fold
   // its high byte into the low byte and clamp the low byte to 1 (the
   // byte-wise min against 0x0001 also clears the high byte).
   const __m128i excess = _mm_subs_epu16(prod_hi, prod_lo);
   const __m128i folded = _mm_or_si128(excess, _mm_srli_epi16(excess, 8));
   const __m128i borrow = _mm_min_epu8(folded, all_one);

   const __m128i reduced =
      _mm_add_epi16(_mm_sub_epi16(prod_lo, prod_hi), borrow);

   // Replacement values for the zero-operand cases
   const __m128i if_x_zero = _mm_sub_epi16(all_one, key);
   const __m128i if_key_zero = _mm_sub_epi16(all_one, X);

   // Select 1-K in the lanes where X[i] is zero ...
   const __m128i after_x_select = _mm_or_si128(
      _mm_andnot_si128(x_zero_mask, reduced),
      _mm_and_si128(if_x_zero, x_zero_mask));

   // ... then 1-X[i] in all lanes if K is zero
   return _mm_or_si128(
      _mm_andnot_si128(key_zero_mask, after_x_select),
      _mm_and_si128(if_key_zero, key_zero_mask));
   }
56
57
/*
58
* 4x8 matrix transpose
59
*
60
* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
61
* transpose_out doesn't need it. Something with the shuffle? Removing
62
* that extra unpack could easily save 3-4 cycles per block, and would
63
* also help a lot with register pressure on 32-bit x86
64
*/
65
BOTAN_FUNC_ISA("sse2")
66
void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
67
0
   {
68
0
   __m128i T0 = _mm_unpackhi_epi32(B0, B1);
69
0
   __m128i T1 = _mm_unpacklo_epi32(B0, B1);
70
0
   __m128i T2 = _mm_unpackhi_epi32(B2, B3);
71
0
   __m128i T3 = _mm_unpacklo_epi32(B2, B3);
72
0
73
0
   __m128i T4 = _mm_unpacklo_epi32(T0, T1);
74
0
   __m128i T5 = _mm_unpackhi_epi32(T0, T1);
75
0
   __m128i T6 = _mm_unpacklo_epi32(T2, T3);
76
0
   __m128i T7 = _mm_unpackhi_epi32(T2, T3);
77
0
78
0
   T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
79
0
   T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
80
0
   T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
81
0
   T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
82
0
83
0
   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
84
0
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
85
0
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
86
0
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
87
0
88
0
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
89
0
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
90
0
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
91
0
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
92
0
93
0
   B0 = _mm_unpacklo_epi64(T0, T2);
94
0
   B1 = _mm_unpackhi_epi64(T0, T2);
95
0
   B2 = _mm_unpacklo_epi64(T1, T3);
96
0
   B3 = _mm_unpackhi_epi64(T1, T3);
97
0
   }
98
99
/*
100
* 4x8 matrix transpose (reverse)
101
*/
102
BOTAN_FUNC_ISA("sse2")
103
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
104
0
   {
105
0
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
106
0
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
107
0
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
108
0
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);
109
0
110
0
   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
111
0
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
112
0
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
113
0
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
114
0
115
0
   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
116
0
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
117
0
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
118
0
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
119
0
120
0
   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
121
0
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
122
0
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
123
0
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
124
0
125
0
   B0 = _mm_unpacklo_epi32(T0, T1);
126
0
   B1 = _mm_unpackhi_epi32(T0, T1);
127
0
   B2 = _mm_unpacklo_epi32(T2, T3);
128
0
   B3 = _mm_unpackhi_epi32(T2, T3);
129
0
   }
130
131
}
132
133
/*
134
* 8 wide IDEA encryption/decryption in SSE2
135
*/
136
BOTAN_FUNC_ISA("sse2")
137
void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const
138
0
   {
139
0
   CT::poison(in, 64);
140
0
   CT::poison(out, 64);
141
0
   CT::poison(EK, 52);
142
0
143
0
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
144
0
145
0
   __m128i B0 = _mm_loadu_si128(in_mm + 0);
146
0
   __m128i B1 = _mm_loadu_si128(in_mm + 1);
147
0
   __m128i B2 = _mm_loadu_si128(in_mm + 2);
148
0
   __m128i B3 = _mm_loadu_si128(in_mm + 3);
149
0
150
0
   transpose_in(B0, B1, B2, B3);
151
0
152
   // byte swap
153
0
   B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
154
0
   B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
155
0
   B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
156
0
   B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
157
0
158
0
   for(size_t i = 0; i != 8; ++i)
159
0
      {
160
0
      B0 = mul(B0, EK[6*i+0]);
161
0
      B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
162
0
      B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
163
0
      B3 = mul(B3, EK[6*i+3]);
164
0
165
0
      __m128i T0 = B2;
166
0
      B2 = _mm_xor_si128(B2, B0);
167
0
      B2 = mul(B2, EK[6*i+4]);
168
0
169
0
      __m128i T1 = B1;
170
0
171
0
      B1 = _mm_xor_si128(B1, B3);
172
0
      B1 = _mm_add_epi16(B1, B2);
173
0
      B1 = mul(B1, EK[6*i+5]);
174
0
175
0
      B2 = _mm_add_epi16(B2, B1);
176
0
177
0
      B0 = _mm_xor_si128(B0, B1);
178
0
      B1 = _mm_xor_si128(B1, T0);
179
0
      B3 = _mm_xor_si128(B3, B2);
180
0
      B2 = _mm_xor_si128(B2, T1);
181
0
      }
182
0
183
0
   B0 = mul(B0, EK[48]);
184
0
   B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
185
0
   B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
186
0
   B3 = mul(B3, EK[51]);
187
0
188
   // byte swap
189
0
   B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
190
0
   B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
191
0
   B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
192
0
   B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
193
0
194
0
   transpose_out(B0, B2, B1, B3);
195
0
196
0
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
197
0
198
0
   _mm_storeu_si128(out_mm + 0, B0);
199
0
   _mm_storeu_si128(out_mm + 1, B2);
200
0
   _mm_storeu_si128(out_mm + 2, B1);
201
0
   _mm_storeu_si128(out_mm + 3, B3);
202
0
203
0
   CT::unpoison(in, 64);
204
0
   CT::unpoison(out, 64);
205
0
   CT::unpoison(EK, 52);
206
0
   }
207
208
}