Coverage Report

Created: 2020-09-16 07:52

/src/botan/src/lib/stream/chacha/chacha_avx2/chacha_avx2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* (C) 2018 Jack Lloyd
3
*
4
* Botan is released under the Simplified BSD License (see license.txt)
5
*/
6
7
#include <botan/chacha.h>
8
#include <botan/internal/simd_avx2.h>
9
10
namespace Botan {
11
12
//static
13
/**
* Compute ChaCha output for eight consecutive counter values in parallel,
* one counter value per lane of the 8-lane SIMD_8x32 vectors.
*
* @param output receives the result; declared as 64*8 bytes and filled by
*        sixteen stores placed at 32-byte steps
* @param state the 16-word input state; words 12 and 13 are handled as a
*        64-bit counter (low word in state[12]) and are advanced by 8
*        before returning
* @param rounds number of rounds to run; must be even (asserted), since
*        the loop below performs two rounds per iteration
*/
BOTAN_FUNC_ISA("avx2")
14
void ChaCha::chacha_avx2_x8(uint8_t output[64*8], uint32_t state[16], size_t rounds)
15
52.4k
   {
16
52.4k
   // NOTE(review): reset_registers() is defined in simd_avx2.h; presumably it
   // prepares the AVX register state before the vector code runs - confirm there.
   SIMD_8x32::reset_registers();
17
52.4k
18
52.4k
   // The round loop runs rounds / 2 iterations, so an odd count cannot be honoured.
   BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
19
52.4k
   // Per-lane offsets 0..7 that are added to the low counter word (state[12]).
   const SIMD_8x32 CTR0 = SIMD_8x32(0, 1, 2, 3, 4, 5, 6, 7);
20
52.4k
21
52.4k
   // C is how far state[12] can still be incremented before it wraps around 2^32.
   const uint32_t C = 0xFFFFFFFF - state[12];
22
52.4k
   // Lane i's low counter word wraps exactly when C < i; each comparison yields
   // 0 or 1, which becomes that lane's carry into the high counter word (state[13]).
   const SIMD_8x32 CTR1 = SIMD_8x32(0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7);
23
52.4k
24
52.4k
   // Broadcast every state word into all 8 lanes; only words 12 and 13 differ per lane.
   SIMD_8x32 R00 = SIMD_8x32::splat(state[ 0]);
25
52.4k
   SIMD_8x32 R01 = SIMD_8x32::splat(state[ 1]);
26
52.4k
   SIMD_8x32 R02 = SIMD_8x32::splat(state[ 2]);
27
52.4k
   SIMD_8x32 R03 = SIMD_8x32::splat(state[ 3]);
28
52.4k
   SIMD_8x32 R04 = SIMD_8x32::splat(state[ 4]);
29
52.4k
   SIMD_8x32 R05 = SIMD_8x32::splat(state[ 5]);
30
52.4k
   SIMD_8x32 R06 = SIMD_8x32::splat(state[ 6]);
31
52.4k
   SIMD_8x32 R07 = SIMD_8x32::splat(state[ 7]);
32
52.4k
   SIMD_8x32 R08 = SIMD_8x32::splat(state[ 8]);
33
52.4k
   SIMD_8x32 R09 = SIMD_8x32::splat(state[ 9]);
34
52.4k
   SIMD_8x32 R10 = SIMD_8x32::splat(state[10]);
35
52.4k
   SIMD_8x32 R11 = SIMD_8x32::splat(state[11]);
36
52.4k
   SIMD_8x32 R12 = SIMD_8x32::splat(state[12]) + CTR0;
37
52.4k
   SIMD_8x32 R13 = SIMD_8x32::splat(state[13]) + CTR1;
38
52.4k
   SIMD_8x32 R14 = SIMD_8x32::splat(state[14]);
39
52.4k
   SIMD_8x32 R15 = SIMD_8x32::splat(state[15]);
40
52.4k
41
576k
   // Each iteration applies the add / xor / rotate sequence (rotations 16, 12, 8, 7)
   // twice: once on the column groups, then once on the diagonal groups.
   for(size_t r = 0; r != rounds / 2; ++r)
42
524k
      {
43
524k
      // Column pass: word groups (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15).
      R00 += R04;
44
524k
      R01 += R05;
45
524k
      R02 += R06;
46
524k
      R03 += R07;
47
524k
48
524k
      R12 ^= R00;
49
524k
      R13 ^= R01;
50
524k
      R14 ^= R02;
51
524k
      R15 ^= R03;
52
524k
53
524k
      R12 = R12.rotl<16>();
54
524k
      R13 = R13.rotl<16>();
55
524k
      R14 = R14.rotl<16>();
56
524k
      R15 = R15.rotl<16>();
57
524k
58
524k
      R08 += R12;
59
524k
      R09 += R13;
60
524k
      R10 += R14;
61
524k
      R11 += R15;
62
524k
63
524k
      R04 ^= R08;
64
524k
      R05 ^= R09;
65
524k
      R06 ^= R10;
66
524k
      R07 ^= R11;
67
524k
68
524k
      R04 = R04.rotl<12>();
69
524k
      R05 = R05.rotl<12>();
70
524k
      R06 = R06.rotl<12>();
71
524k
      R07 = R07.rotl<12>();
72
524k
73
524k
      R00 += R04;
74
524k
      R01 += R05;
75
524k
      R02 += R06;
76
524k
      R03 += R07;
77
524k
78
524k
      R12 ^= R00;
79
524k
      R13 ^= R01;
80
524k
      R14 ^= R02;
81
524k
      R15 ^= R03;
82
524k
83
524k
      R12 = R12.rotl<8>();
84
524k
      R13 = R13.rotl<8>();
85
524k
      R14 = R14.rotl<8>();
86
524k
      R15 = R15.rotl<8>();
87
524k
88
524k
      R08 += R12;
89
524k
      R09 += R13;
90
524k
      R10 += R14;
91
524k
      R11 += R15;
92
524k
93
524k
      R04 ^= R08;
94
524k
      R05 ^= R09;
95
524k
      R06 ^= R10;
96
524k
      R07 ^= R11;
97
524k
98
524k
      R04 = R04.rotl<7>();
99
524k
      R05 = R05.rotl<7>();
100
524k
      R06 = R06.rotl<7>();
101
524k
      R07 = R07.rotl<7>();
102
524k
103
524k
      // Diagonal pass: word groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
      R00 += R05;
104
524k
      R01 += R06;
105
524k
      R02 += R07;
106
524k
      R03 += R04;
107
524k
108
524k
      R15 ^= R00;
109
524k
      R12 ^= R01;
110
524k
      R13 ^= R02;
111
524k
      R14 ^= R03;
112
524k
113
524k
      R15 = R15.rotl<16>();
114
524k
      R12 = R12.rotl<16>();
115
524k
      R13 = R13.rotl<16>();
116
524k
      R14 = R14.rotl<16>();
117
524k
118
524k
      R10 += R15;
119
524k
      R11 += R12;
120
524k
      R08 += R13;
121
524k
      R09 += R14;
122
524k
123
524k
      R05 ^= R10;
124
524k
      R06 ^= R11;
125
524k
      R07 ^= R08;
126
524k
      R04 ^= R09;
127
524k
128
524k
      R05 = R05.rotl<12>();
129
524k
      R06 = R06.rotl<12>();
130
524k
      R07 = R07.rotl<12>();
131
524k
      R04 = R04.rotl<12>();
132
524k
133
524k
      R00 += R05;
134
524k
      R01 += R06;
135
524k
      R02 += R07;
136
524k
      R03 += R04;
137
524k
138
524k
      R15 ^= R00;
139
524k
      R12 ^= R01;
140
524k
      R13 ^= R02;
141
524k
      R14 ^= R03;
142
524k
143
524k
      R15 = R15.rotl<8>();
144
524k
      R12 = R12.rotl<8>();
145
524k
      R13 = R13.rotl<8>();
146
524k
      R14 = R14.rotl<8>();
147
524k
148
524k
      R10 += R15;
149
524k
      R11 += R12;
150
524k
      R08 += R13;
151
524k
      R09 += R14;
152
524k
153
524k
      R05 ^= R10;
154
524k
      R06 ^= R11;
155
524k
      R07 ^= R08;
156
524k
      R04 ^= R09;
157
524k
158
524k
      R05 = R05.rotl<7>();
159
524k
      R06 = R06.rotl<7>();
160
524k
      R07 = R07.rotl<7>();
161
524k
      R04 = R04.rotl<7>();
162
524k
      }
163
52.4k
164
52.4k
   // Feed-forward: add the original input words (including the per-lane counter
   // offsets for words 12 and 13) back onto the permuted values.
   R00 += SIMD_8x32::splat(state[0]);
165
52.4k
   R01 += SIMD_8x32::splat(state[1]);
166
52.4k
   R02 += SIMD_8x32::splat(state[2]);
167
52.4k
   R03 += SIMD_8x32::splat(state[3]);
168
52.4k
   R04 += SIMD_8x32::splat(state[4]);
169
52.4k
   R05 += SIMD_8x32::splat(state[5]);
170
52.4k
   R06 += SIMD_8x32::splat(state[6]);
171
52.4k
   R07 += SIMD_8x32::splat(state[7]);
172
52.4k
   R08 += SIMD_8x32::splat(state[8]);
173
52.4k
   R09 += SIMD_8x32::splat(state[9]);
174
52.4k
   R10 += SIMD_8x32::splat(state[10]);
175
52.4k
   R11 += SIMD_8x32::splat(state[11]);
176
52.4k
   R12 += SIMD_8x32::splat(state[12]) + CTR0;
177
52.4k
   R13 += SIMD_8x32::splat(state[13]) + CTR1;
178
52.4k
   R14 += SIMD_8x32::splat(state[14]);
179
52.4k
   R15 += SIMD_8x32::splat(state[15]);
180
52.4k
181
52.4k
   // NOTE(review): transpose() is defined in simd_avx2.h; presumably it converts
   // each group of eight registers from one-word-per-register to
   // one-lane-per-register layout - confirm there.
   SIMD_8x32::transpose(R00, R01, R02, R03, R04, R05, R06, R07);
182
52.4k
   SIMD_8x32::transpose(R08, R09, R10, R11, R12, R13, R14, R15);
183
52.4k
184
52.4k
   // Sixteen stores at 32-byte steps, alternating between the two transposed
   // groups (R00, R08, R01, R09, ...), so each 64-byte span of output holds one
   // register from the first group followed by one from the second.
   R00.store_le(output);
185
52.4k
   R08.store_le(output + 32*1);
186
52.4k
   R01.store_le(output + 32*2);
187
52.4k
   R09.store_le(output + 32*3);
188
52.4k
   R02.store_le(output + 32*4);
189
52.4k
   R10.store_le(output + 32*5);
190
52.4k
   R03.store_le(output + 32*6);
191
52.4k
   R11.store_le(output + 32*7);
192
52.4k
   R04.store_le(output + 32*8);
193
52.4k
   R12.store_le(output + 32*9);
194
52.4k
   R05.store_le(output + 32*10);
195
52.4k
   R13.store_le(output + 32*11);
196
52.4k
   R06.store_le(output + 32*12);
197
52.4k
   R14.store_le(output + 32*13);
198
52.4k
   R07.store_le(output + 32*14);
199
52.4k
   R15.store_le(output + 32*15);
200
52.4k
201
52.4k
   // NOTE(review): zero_registers() is defined in simd_avx2.h; presumably it wipes
   // the vector registers so key-dependent values do not linger - confirm there.
   SIMD_8x32::zero_registers();
202
52.4k
203
52.4k
   // Eight consecutive counter values were consumed: advance the low counter word
   // and, if it wrapped around (result smaller than the increment), carry into
   // the high word.
   state[12] += 8;
204
52.4k
   if(state[12] < 8)
205
0
      state[13]++;
206
52.4k
   }
207
}