/src/botan/src/lib/stream/chacha/chacha_simd32/chacha_simd32.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #include <botan/chacha.h> |
8 | | #include <botan/internal/simd_32.h> |
9 | | |
10 | | namespace Botan { |
11 | | |
12 | | //static |
13 | | void ChaCha::chacha_simd32_x4(uint8_t output[64*4], uint32_t state[16], size_t rounds) |
14 | 0 | { |
15 | 0 | BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); |
16 | 0 | const SIMD_4x32 CTR0 = SIMD_4x32(0, 1, 2, 3); |
17 | 0 |
|
18 | 0 | const uint32_t C = 0xFFFFFFFF - state[12]; |
19 | 0 | const SIMD_4x32 CTR1 = SIMD_4x32(0, C < 1, C < 2, C < 3); |
20 | 0 |
|
21 | 0 | SIMD_4x32 R00 = SIMD_4x32::splat(state[ 0]); |
22 | 0 | SIMD_4x32 R01 = SIMD_4x32::splat(state[ 1]); |
23 | 0 | SIMD_4x32 R02 = SIMD_4x32::splat(state[ 2]); |
24 | 0 | SIMD_4x32 R03 = SIMD_4x32::splat(state[ 3]); |
25 | 0 | SIMD_4x32 R04 = SIMD_4x32::splat(state[ 4]); |
26 | 0 | SIMD_4x32 R05 = SIMD_4x32::splat(state[ 5]); |
27 | 0 | SIMD_4x32 R06 = SIMD_4x32::splat(state[ 6]); |
28 | 0 | SIMD_4x32 R07 = SIMD_4x32::splat(state[ 7]); |
29 | 0 | SIMD_4x32 R08 = SIMD_4x32::splat(state[ 8]); |
30 | 0 | SIMD_4x32 R09 = SIMD_4x32::splat(state[ 9]); |
31 | 0 | SIMD_4x32 R10 = SIMD_4x32::splat(state[10]); |
32 | 0 | SIMD_4x32 R11 = SIMD_4x32::splat(state[11]); |
33 | 0 | SIMD_4x32 R12 = SIMD_4x32::splat(state[12]) + CTR0; |
34 | 0 | SIMD_4x32 R13 = SIMD_4x32::splat(state[13]) + CTR1; |
35 | 0 | SIMD_4x32 R14 = SIMD_4x32::splat(state[14]); |
36 | 0 | SIMD_4x32 R15 = SIMD_4x32::splat(state[15]); |
37 | 0 |
|
38 | 0 | for(size_t r = 0; r != rounds / 2; ++r) |
39 | 0 | { |
40 | 0 | R00 += R04; |
41 | 0 | R01 += R05; |
42 | 0 | R02 += R06; |
43 | 0 | R03 += R07; |
44 | 0 |
|
45 | 0 | R12 ^= R00; |
46 | 0 | R13 ^= R01; |
47 | 0 | R14 ^= R02; |
48 | 0 | R15 ^= R03; |
49 | 0 |
|
50 | 0 | R12 = R12.rotl<16>(); |
51 | 0 | R13 = R13.rotl<16>(); |
52 | 0 | R14 = R14.rotl<16>(); |
53 | 0 | R15 = R15.rotl<16>(); |
54 | 0 |
|
55 | 0 | R08 += R12; |
56 | 0 | R09 += R13; |
57 | 0 | R10 += R14; |
58 | 0 | R11 += R15; |
59 | 0 |
|
60 | 0 | R04 ^= R08; |
61 | 0 | R05 ^= R09; |
62 | 0 | R06 ^= R10; |
63 | 0 | R07 ^= R11; |
64 | 0 |
|
65 | 0 | R04 = R04.rotl<12>(); |
66 | 0 | R05 = R05.rotl<12>(); |
67 | 0 | R06 = R06.rotl<12>(); |
68 | 0 | R07 = R07.rotl<12>(); |
69 | 0 |
|
70 | 0 | R00 += R04; |
71 | 0 | R01 += R05; |
72 | 0 | R02 += R06; |
73 | 0 | R03 += R07; |
74 | 0 |
|
75 | 0 | R12 ^= R00; |
76 | 0 | R13 ^= R01; |
77 | 0 | R14 ^= R02; |
78 | 0 | R15 ^= R03; |
79 | 0 |
|
80 | 0 | R12 = R12.rotl<8>(); |
81 | 0 | R13 = R13.rotl<8>(); |
82 | 0 | R14 = R14.rotl<8>(); |
83 | 0 | R15 = R15.rotl<8>(); |
84 | 0 |
|
85 | 0 | R08 += R12; |
86 | 0 | R09 += R13; |
87 | 0 | R10 += R14; |
88 | 0 | R11 += R15; |
89 | 0 |
|
90 | 0 | R04 ^= R08; |
91 | 0 | R05 ^= R09; |
92 | 0 | R06 ^= R10; |
93 | 0 | R07 ^= R11; |
94 | 0 |
|
95 | 0 | R04 = R04.rotl<7>(); |
96 | 0 | R05 = R05.rotl<7>(); |
97 | 0 | R06 = R06.rotl<7>(); |
98 | 0 | R07 = R07.rotl<7>(); |
99 | 0 |
|
100 | 0 | R00 += R05; |
101 | 0 | R01 += R06; |
102 | 0 | R02 += R07; |
103 | 0 | R03 += R04; |
104 | 0 |
|
105 | 0 | R15 ^= R00; |
106 | 0 | R12 ^= R01; |
107 | 0 | R13 ^= R02; |
108 | 0 | R14 ^= R03; |
109 | 0 |
|
110 | 0 | R15 = R15.rotl<16>(); |
111 | 0 | R12 = R12.rotl<16>(); |
112 | 0 | R13 = R13.rotl<16>(); |
113 | 0 | R14 = R14.rotl<16>(); |
114 | 0 |
|
115 | 0 | R10 += R15; |
116 | 0 | R11 += R12; |
117 | 0 | R08 += R13; |
118 | 0 | R09 += R14; |
119 | 0 |
|
120 | 0 | R05 ^= R10; |
121 | 0 | R06 ^= R11; |
122 | 0 | R07 ^= R08; |
123 | 0 | R04 ^= R09; |
124 | 0 |
|
125 | 0 | R05 = R05.rotl<12>(); |
126 | 0 | R06 = R06.rotl<12>(); |
127 | 0 | R07 = R07.rotl<12>(); |
128 | 0 | R04 = R04.rotl<12>(); |
129 | 0 |
|
130 | 0 | R00 += R05; |
131 | 0 | R01 += R06; |
132 | 0 | R02 += R07; |
133 | 0 | R03 += R04; |
134 | 0 |
|
135 | 0 | R15 ^= R00; |
136 | 0 | R12 ^= R01; |
137 | 0 | R13 ^= R02; |
138 | 0 | R14 ^= R03; |
139 | 0 |
|
140 | 0 | R15 = R15.rotl<8>(); |
141 | 0 | R12 = R12.rotl<8>(); |
142 | 0 | R13 = R13.rotl<8>(); |
143 | 0 | R14 = R14.rotl<8>(); |
144 | 0 |
|
145 | 0 | R10 += R15; |
146 | 0 | R11 += R12; |
147 | 0 | R08 += R13; |
148 | 0 | R09 += R14; |
149 | 0 |
|
150 | 0 | R05 ^= R10; |
151 | 0 | R06 ^= R11; |
152 | 0 | R07 ^= R08; |
153 | 0 | R04 ^= R09; |
154 | 0 |
|
155 | 0 | R05 = R05.rotl<7>(); |
156 | 0 | R06 = R06.rotl<7>(); |
157 | 0 | R07 = R07.rotl<7>(); |
158 | 0 | R04 = R04.rotl<7>(); |
159 | 0 | } |
160 | 0 |
|
161 | 0 | R00 += SIMD_4x32::splat(state[0]); |
162 | 0 | R01 += SIMD_4x32::splat(state[1]); |
163 | 0 | R02 += SIMD_4x32::splat(state[2]); |
164 | 0 | R03 += SIMD_4x32::splat(state[3]); |
165 | 0 | R04 += SIMD_4x32::splat(state[4]); |
166 | 0 | R05 += SIMD_4x32::splat(state[5]); |
167 | 0 | R06 += SIMD_4x32::splat(state[6]); |
168 | 0 | R07 += SIMD_4x32::splat(state[7]); |
169 | 0 | R08 += SIMD_4x32::splat(state[8]); |
170 | 0 | R09 += SIMD_4x32::splat(state[9]); |
171 | 0 | R10 += SIMD_4x32::splat(state[10]); |
172 | 0 | R11 += SIMD_4x32::splat(state[11]); |
173 | 0 | R12 += SIMD_4x32::splat(state[12]) + CTR0; |
174 | 0 | R13 += SIMD_4x32::splat(state[13]) + CTR1; |
175 | 0 | R14 += SIMD_4x32::splat(state[14]); |
176 | 0 | R15 += SIMD_4x32::splat(state[15]); |
177 | 0 |
|
178 | 0 | SIMD_4x32::transpose(R00, R01, R02, R03); |
179 | 0 | SIMD_4x32::transpose(R04, R05, R06, R07); |
180 | 0 | SIMD_4x32::transpose(R08, R09, R10, R11); |
181 | 0 | SIMD_4x32::transpose(R12, R13, R14, R15); |
182 | 0 |
|
183 | 0 | R00.store_le(output + 0*16); |
184 | 0 | R04.store_le(output + 1*16); |
185 | 0 | R08.store_le(output + 2*16); |
186 | 0 | R12.store_le(output + 3*16); |
187 | 0 | R01.store_le(output + 4*16); |
188 | 0 | R05.store_le(output + 5*16); |
189 | 0 | R09.store_le(output + 6*16); |
190 | 0 | R13.store_le(output + 7*16); |
191 | 0 | R02.store_le(output + 8*16); |
192 | 0 | R06.store_le(output + 9*16); |
193 | 0 | R10.store_le(output + 10*16); |
194 | 0 | R14.store_le(output + 11*16); |
195 | 0 | R03.store_le(output + 12*16); |
196 | 0 | R07.store_le(output + 13*16); |
197 | 0 | R11.store_le(output + 14*16); |
198 | 0 | R15.store_le(output + 15*16); |
199 | 0 |
|
200 | 0 | state[12] += 4; |
201 | 0 | if(state[12] < 4) |
202 | 0 | state[13]++; |
203 | 0 | } |
204 | | |
205 | | } |