Coverage Report

Created: 2019-09-11 14:12

/src/botan/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* (C) 2018 Jack Lloyd
3
*
4
* Botan is released under the Simplified BSD License (see license.txt)
5
*/
6
7
#include <botan/chacha.h>
8
#include <botan/internal/simd_avx2.h>
9
10
namespace Botan {
11
12
//static
13
/*
* Compute eight ChaCha blocks at once using 8-lane 32-bit SIMD registers.
*
* Each of the 16 state words is held in its own register (R00..R15) with
* one lane per block; the lanes differ only in the counter words
* state[12] (low) and state[13] (high).
*
* @param output receives 64*8 = 512 bytes, written as sixteen unaligned
*        256-bit stores
* @param state the 16-word input state; on return state[12] has been
*        advanced by 8, with a carry into state[13] when it wraps
* @param rounds number of rounds; must be even, since every loop
*        iteration performs two passes over the state
*/
BOTAN_FUNC_ISA("avx2")
14
void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds)
15
35.5k
   {
16
35.5k
   // NOTE(review): helper from simd_avx2.h; presumably puts the AVX2
   // registers into a clean state before use - confirm against its definition
   SIMD_8x32::reset_registers();
17
35.5k
18
35.5k
   // The loop below runs rounds / 2 iterations, so an odd count is rejected
   BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
19
35.5k
   // Per-block offsets 0..7 added to the low counter word state[12]
   // (presumably argument i lands in lane i - confirm in SIMD_8x32)
   const SIMD_8x32 CTR0 = SIMD_8x32(0, 1, 2, 3, 4, 5, 6, 7);
20
35.5k
21
35.5k
   // C is the distance from state[12] to 0xFFFFFFFF. (C < i) is 1 exactly
   // when state[12] + i wraps past 32 bits, so CTR1 holds the carry that
   // must be added to the high counter word state[13] for that block.
   const uint32_t C = 0xFFFFFFFF - state[12];
22
35.5k
   const SIMD_8x32 CTR1 = SIMD_8x32(0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7);
23
35.5k
24
35.5k
   // Broadcast every state word to all lanes; only the two counter words
   // (R12, R13) differ between lanes.
   SIMD_8x32 R00 = SIMD_8x32::splat(state[ 0]);
25
35.5k
   SIMD_8x32 R01 = SIMD_8x32::splat(state[ 1]);
26
35.5k
   SIMD_8x32 R02 = SIMD_8x32::splat(state[ 2]);
27
35.5k
   SIMD_8x32 R03 = SIMD_8x32::splat(state[ 3]);
28
35.5k
   SIMD_8x32 R04 = SIMD_8x32::splat(state[ 4]);
29
35.5k
   SIMD_8x32 R05 = SIMD_8x32::splat(state[ 5]);
30
35.5k
   SIMD_8x32 R06 = SIMD_8x32::splat(state[ 6]);
31
35.5k
   SIMD_8x32 R07 = SIMD_8x32::splat(state[ 7]);
32
35.5k
   SIMD_8x32 R08 = SIMD_8x32::splat(state[ 8]);
33
35.5k
   SIMD_8x32 R09 = SIMD_8x32::splat(state[ 9]);
34
35.5k
   SIMD_8x32 R10 = SIMD_8x32::splat(state[10]);
35
35.5k
   SIMD_8x32 R11 = SIMD_8x32::splat(state[11]);
36
35.5k
   SIMD_8x32 R12 = SIMD_8x32::splat(state[12]) + CTR0;
37
35.5k
   SIMD_8x32 R13 = SIMD_8x32::splat(state[13]) + CTR1;
38
35.5k
   SIMD_8x32 R14 = SIMD_8x32::splat(state[14]);
39
35.5k
   SIMD_8x32 R15 = SIMD_8x32::splat(state[15]);
40
35.5k
41
390k
   // Each iteration applies the add / xor / rotate-left sequence
   // (rotations 16, 12, 8, 7) twice, with different register groupings.
   for(size_t r = 0; r != rounds / 2; ++r)
42
355k
      {
43
355k
      // First pass, register groups: (R00,R04,R08,R12) (R01,R05,R09,R13)
      // (R02,R06,R10,R14) (R03,R07,R11,R15)
      R00 += R04;
44
355k
      R01 += R05;
45
355k
      R02 += R06;
46
355k
      R03 += R07;
47
355k
48
355k
      R12 ^= R00;
49
355k
      R13 ^= R01;
50
355k
      R14 ^= R02;
51
355k
      R15 ^= R03;
52
355k
53
355k
      R12 = R12.rotl<16>();
54
355k
      R13 = R13.rotl<16>();
55
355k
      R14 = R14.rotl<16>();
56
355k
      R15 = R15.rotl<16>();
57
355k
58
355k
      R08 += R12;
59
355k
      R09 += R13;
60
355k
      R10 += R14;
61
355k
      R11 += R15;
62
355k
63
355k
      R04 ^= R08;
64
355k
      R05 ^= R09;
65
355k
      R06 ^= R10;
66
355k
      R07 ^= R11;
67
355k
68
355k
      R04 = R04.rotl<12>();
69
355k
      R05 = R05.rotl<12>();
70
355k
      R06 = R06.rotl<12>();
71
355k
      R07 = R07.rotl<12>();
72
355k
73
355k
      R00 += R04;
74
355k
      R01 += R05;
75
355k
      R02 += R06;
76
355k
      R03 += R07;
77
355k
78
355k
      R12 ^= R00;
79
355k
      R13 ^= R01;
80
355k
      R14 ^= R02;
81
355k
      R15 ^= R03;
82
355k
83
355k
      R12 = R12.rotl<8>();
84
355k
      R13 = R13.rotl<8>();
85
355k
      R14 = R14.rotl<8>();
86
355k
      R15 = R15.rotl<8>();
87
355k
88
355k
      R08 += R12;
89
355k
      R09 += R13;
90
355k
      R10 += R14;
91
355k
      R11 += R15;
92
355k
93
355k
      R04 ^= R08;
94
355k
      R05 ^= R09;
95
355k
      R06 ^= R10;
96
355k
      R07 ^= R11;
97
355k
98
355k
      R04 = R04.rotl<7>();
99
355k
      R05 = R05.rotl<7>();
100
355k
      R06 = R06.rotl<7>();
101
355k
      R07 = R07.rotl<7>();
102
355k
103
355k
      // Second pass, same sequence on the shifted register groups:
      // (R00,R05,R10,R15) (R01,R06,R11,R12) (R02,R07,R08,R13) (R03,R04,R09,R14)
      R00 += R05;
104
355k
      R01 += R06;
105
355k
      R02 += R07;
106
355k
      R03 += R04;
107
355k
108
355k
      R15 ^= R00;
109
355k
      R12 ^= R01;
110
355k
      R13 ^= R02;
111
355k
      R14 ^= R03;
112
355k
113
355k
      R15 = R15.rotl<16>();
114
355k
      R12 = R12.rotl<16>();
115
355k
      R13 = R13.rotl<16>();
116
355k
      R14 = R14.rotl<16>();
117
355k
118
355k
      R10 += R15;
119
355k
      R11 += R12;
120
355k
      R08 += R13;
121
355k
      R09 += R14;
122
355k
123
355k
      R05 ^= R10;
124
355k
      R06 ^= R11;
125
355k
      R07 ^= R08;
126
355k
      R04 ^= R09;
127
355k
128
355k
      R05 = R05.rotl<12>();
129
355k
      R06 = R06.rotl<12>();
130
355k
      R07 = R07.rotl<12>();
131
355k
      R04 = R04.rotl<12>();
132
355k
133
355k
      R00 += R05;
134
355k
      R01 += R06;
135
355k
      R02 += R07;
136
355k
      R03 += R04;
137
355k
138
355k
      R15 ^= R00;
139
355k
      R12 ^= R01;
140
355k
      R13 ^= R02;
141
355k
      R14 ^= R03;
142
355k
143
355k
      R15 = R15.rotl<8>();
144
355k
      R12 = R12.rotl<8>();
145
355k
      R13 = R13.rotl<8>();
146
355k
      R14 = R14.rotl<8>();
147
355k
148
355k
      R10 += R15;
149
355k
      R11 += R12;
150
355k
      R08 += R13;
151
355k
      R09 += R14;
152
355k
153
355k
      R05 ^= R10;
154
355k
      R06 ^= R11;
155
355k
      R07 ^= R08;
156
355k
      R04 ^= R09;
157
355k
158
355k
      R05 = R05.rotl<7>();
159
355k
      R06 = R06.rotl<7>();
160
355k
      R07 = R07.rotl<7>();
161
355k
      R04 = R04.rotl<7>();
162
355k
      }
163
35.5k
164
35.5k
   // Add the original input words back into the mixed state, using the
   // same per-lane counter offsets that were applied before the loop.
   R00 += SIMD_8x32::splat(state[0]);
165
35.5k
   R01 += SIMD_8x32::splat(state[1]);
166
35.5k
   R02 += SIMD_8x32::splat(state[2]);
167
35.5k
   R03 += SIMD_8x32::splat(state[3]);
168
35.5k
   R04 += SIMD_8x32::splat(state[4]);
169
35.5k
   R05 += SIMD_8x32::splat(state[5]);
170
35.5k
   R06 += SIMD_8x32::splat(state[6]);
171
35.5k
   R07 += SIMD_8x32::splat(state[7]);
172
35.5k
   R08 += SIMD_8x32::splat(state[8]);
173
35.5k
   R09 += SIMD_8x32::splat(state[9]);
174
35.5k
   R10 += SIMD_8x32::splat(state[10]);
175
35.5k
   R11 += SIMD_8x32::splat(state[11]);
176
35.5k
   R12 += SIMD_8x32::splat(state[12]) + CTR0;
177
35.5k
   R13 += SIMD_8x32::splat(state[13]) + CTR1;
178
35.5k
   R14 += SIMD_8x32::splat(state[14]);
179
35.5k
   R15 += SIMD_8x32::splat(state[15]);
180
35.5k
181
35.5k
   // NOTE(review): transpose() is defined in simd_avx2.h; presumably it
   // regroups each set of four registers so that the words belonging to one
   // block become adjacent within a 128-bit half - confirm before relying on it
   SIMD_8x32::transpose(R00, R01, R02, R03);
182
35.5k
   SIMD_8x32::transpose(R04, R05, R06, R07);
183
35.5k
   SIMD_8x32::transpose(R08, R09, R10, R11);
184
35.5k
   SIMD_8x32::transpose(R12, R13, R14, R15);
185
35.5k
186
35.5k
   __m256i* output_mm = reinterpret_cast<__m256i*>(output);
187
35.5k
188
35.5k
   // Control byte 0 + (2 << 4) pairs the low 128-bit halves of the two
   // operands; each consecutive pair of 32-byte stores fills 64 bytes of
   // output. Unaligned stores are used, so output needs no special alignment.
   _mm256_storeu_si256(output_mm     , _mm256_permute2x128_si256(R00.handle(), R04.handle(), 0 + (2 << 4)));
189
35.5k
   _mm256_storeu_si256(output_mm +  1, _mm256_permute2x128_si256(R08.handle(), R12.handle(), 0 + (2 << 4)));
190
35.5k
   _mm256_storeu_si256(output_mm +  2, _mm256_permute2x128_si256(R01.handle(), R05.handle(), 0 + (2 << 4)));
191
35.5k
   _mm256_storeu_si256(output_mm +  3, _mm256_permute2x128_si256(R09.handle(), R13.handle(), 0 + (2 << 4)));
192
35.5k
   _mm256_storeu_si256(output_mm +  4, _mm256_permute2x128_si256(R02.handle(), R06.handle(), 0 + (2 << 4)));
193
35.5k
   _mm256_storeu_si256(output_mm +  5, _mm256_permute2x128_si256(R10.handle(), R14.handle(), 0 + (2 << 4)));
194
35.5k
   _mm256_storeu_si256(output_mm +  6, _mm256_permute2x128_si256(R03.handle(), R07.handle(), 0 + (2 << 4)));
195
35.5k
   _mm256_storeu_si256(output_mm +  7, _mm256_permute2x128_si256(R11.handle(), R15.handle(), 0 + (2 << 4)));
196
35.5k
197
35.5k
   // Control byte 1 + (3 << 4) pairs the high 128-bit halves instead,
   // filling the second 256 bytes of output.
   _mm256_storeu_si256(output_mm +  8, _mm256_permute2x128_si256(R00.handle(), R04.handle(), 1 + (3 << 4)));
198
35.5k
   _mm256_storeu_si256(output_mm +  9, _mm256_permute2x128_si256(R08.handle(), R12.handle(), 1 + (3 << 4)));
199
35.5k
   _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(R01.handle(), R05.handle(), 1 + (3 << 4)));
200
35.5k
   _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(R09.handle(), R13.handle(), 1 + (3 << 4)));
201
35.5k
   _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(R02.handle(), R06.handle(), 1 + (3 << 4)));
202
35.5k
   _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(R10.handle(), R14.handle(), 1 + (3 << 4)));
203
35.5k
   _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(R03.handle(), R07.handle(), 1 + (3 << 4)));
204
35.5k
   _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(R11.handle(), R15.handle(), 1 + (3 << 4)));
205
35.5k
206
35.5k
   // NOTE(review): helper from simd_avx2.h; presumably clears the vector
   // registers so no keystream material is left behind - confirm
   SIMD_8x32::zero_registers();
207
35.5k
208
35.5k
   // Eight blocks were produced: advance the low counter word by 8 and
   // propagate a carry into the high word if the addition wrapped around.
   state[12] += 8;
209
35.5k
   if(state[12] < 8)
210
0
      state[13]++;
211
35.5k
   }
212
}