/src/botan/src/lib/pbkdf/argon2/argon2.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /** |
2 | | * (C) 2018,2019 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #include <botan/argon2.h> |
8 | | #include <botan/loadstor.h> |
9 | | #include <botan/hash.h> |
10 | | #include <botan/mem_ops.h> |
11 | | #include <botan/rotate.h> |
12 | | #include <botan/exceptn.h> |
13 | | |
14 | | namespace Botan { |
15 | | |
16 | | namespace { |
17 | | |
// Argon2 divides each lane into 4 slices ("sync points"); lanes are
// synchronized at each slice boundary. `static` is redundant inside the
// anonymous namespace, and constexpr makes the compile-time nature explicit.
constexpr size_t SYNC_POINTS = 4;
19 | | |
20 | | secure_vector<uint8_t> argon2_H0(HashFunction& blake2b, |
21 | | size_t output_len, |
22 | | const char* password, size_t password_len, |
23 | | const uint8_t salt[], size_t salt_len, |
24 | | const uint8_t key[], size_t key_len, |
25 | | const uint8_t ad[], size_t ad_len, |
26 | | size_t y, size_t p, size_t M, size_t t) |
27 | 0 | { |
28 | 0 | const uint8_t v = 19; // Argon2 version code |
29 | 0 |
|
30 | 0 | blake2b.update_le(static_cast<uint32_t>(p)); |
31 | 0 | blake2b.update_le(static_cast<uint32_t>(output_len)); |
32 | 0 | blake2b.update_le(static_cast<uint32_t>(M)); |
33 | 0 | blake2b.update_le(static_cast<uint32_t>(t)); |
34 | 0 | blake2b.update_le(static_cast<uint32_t>(v)); |
35 | 0 | blake2b.update_le(static_cast<uint32_t>(y)); |
36 | 0 |
|
37 | 0 | blake2b.update_le(static_cast<uint32_t>(password_len)); |
38 | 0 | blake2b.update(cast_char_ptr_to_uint8(password), password_len); |
39 | 0 |
|
40 | 0 | blake2b.update_le(static_cast<uint32_t>(salt_len)); |
41 | 0 | blake2b.update(salt, salt_len); |
42 | 0 |
|
43 | 0 | blake2b.update_le(static_cast<uint32_t>(key_len)); |
44 | 0 | blake2b.update(key, key_len); |
45 | 0 |
|
46 | 0 | blake2b.update_le(static_cast<uint32_t>(ad_len)); |
47 | 0 | blake2b.update(ad, ad_len); |
48 | 0 |
|
49 | 0 | return blake2b.final(); |
50 | 0 | } |
51 | | |
52 | | void Htick(secure_vector<uint8_t>& T, |
53 | | uint8_t output[], |
54 | | size_t output_len, |
55 | | HashFunction& blake2b, |
56 | | const secure_vector<uint8_t>& H0, |
57 | | size_t p0, size_t p1) |
58 | 0 | { |
59 | 0 | BOTAN_ASSERT_NOMSG(output_len % 64 == 0); |
60 | 0 |
|
61 | 0 | blake2b.update_le(static_cast<uint32_t>(output_len)); |
62 | 0 | blake2b.update(H0); |
63 | 0 | blake2b.update_le(static_cast<uint32_t>(p0)); |
64 | 0 | blake2b.update_le(static_cast<uint32_t>(p1)); |
65 | 0 |
|
66 | 0 | blake2b.final(&T[0]); |
67 | 0 |
|
68 | 0 | while(output_len > 64) |
69 | 0 | { |
70 | 0 | copy_mem(output, &T[0], 32); |
71 | 0 | output_len -= 32; |
72 | 0 | output += 32; |
73 | 0 |
|
74 | 0 | blake2b.update(T); |
75 | 0 | blake2b.final(&T[0]); |
76 | 0 | } |
77 | 0 |
|
78 | 0 | if(output_len > 0) |
79 | 0 | copy_mem(output, &T[0], output_len); |
80 | 0 | } |
81 | | |
/*
* Derive the final Argon2 output from the filled memory.
*
* The last 1024-byte block of every lane is XORed together, serialized
* little-endian, and hashed with BLAKE2b sized to the output. Outputs
* longer than 64 bytes are produced by the same expand-by-rehashing
* construction as H': emit 32 bytes per step, rehash T, and finish
* with a BLAKE2b sized to the remainder.
*/
void extract_key(uint8_t output[], size_t output_len,
                 const secure_vector<uint64_t>& B,
                 size_t memory, size_t threads)
   {
   const size_t lanes = memory / threads;

   // XOR-accumulate the final block (128 64-bit words) of each lane
   secure_vector<uint64_t> sum(128);

   for(size_t lane = 0; lane != threads; ++lane)
      {
      // Word range of this lane's last block
      size_t start = 128*(lane * lanes + lanes - 1);
      size_t end = 128*(lane * lanes + lanes);

      for(size_t j = start; j != end; ++j)
         {
         sum[j % 128] ^= B[j];
         }
      }

   // Serialize the 1024-byte block little-endian before hashing
   secure_vector<uint8_t> sum8(1024);
   copy_out_le(sum8.data(), 1024, sum.data());

   if(output_len <= 64)
      {
      // Short output: a single sized BLAKE2b over (len || sum) suffices
      std::unique_ptr<HashFunction> blake2b = HashFunction::create_or_throw("BLAKE2b(" + std::to_string(output_len*8) + ")");
      blake2b->update_le(static_cast<uint32_t>(output_len));
      blake2b->update(sum8.data(), sum8.size());
      blake2b->final(output);
      }
   else
      {
      secure_vector<uint8_t> T(64);

      std::unique_ptr<HashFunction> blake2b = HashFunction::create_or_throw("BLAKE2b(512)");
      blake2b->update_le(static_cast<uint32_t>(output_len));
      blake2b->update(sum8.data(), sum8.size());
      blake2b->final(&T[0]);

      // Emit 32 bytes per step; only rehash T while another full step
      // will still be needed (the tail is handled below)
      while(output_len > 64)
         {
         copy_mem(output, &T[0], 32);
         output_len -= 32;
         output += 32;

         if(output_len > 64)
            {
            blake2b->update(T);
            blake2b->final(&T[0]);
            }
         }

      if(output_len == 64)
         {
         blake2b->update(T);
         blake2b->final(output);
         }
      else
         {
         // Remainder is < 64 bytes: final BLAKE2b sized to fit exactly
         std::unique_ptr<HashFunction> blake2b_f = HashFunction::create_or_throw("BLAKE2b(" + std::to_string(output_len*8) + ")");
         blake2b_f->update(T);
         blake2b_f->final(output);
         }
      }
   }
146 | | |
147 | | void init_blocks(secure_vector<uint64_t>& B, |
148 | | HashFunction& blake2b, |
149 | | const secure_vector<uint8_t>& H0, |
150 | | size_t memory, |
151 | | size_t threads) |
152 | 0 | { |
153 | 0 | BOTAN_ASSERT_NOMSG(B.size() >= threads*256); |
154 | 0 |
|
155 | 0 | secure_vector<uint8_t> H(1024); |
156 | 0 | secure_vector<uint8_t> T(blake2b.output_length()); |
157 | 0 |
|
158 | 0 | for(size_t i = 0; i != threads; ++i) |
159 | 0 | { |
160 | 0 | const size_t B_off = i * (memory / threads); |
161 | 0 |
|
162 | 0 | BOTAN_ASSERT_NOMSG(B.size() >= 128*(B_off+2)); |
163 | 0 |
|
164 | 0 | Htick(T, &H[0], H.size(), blake2b, H0, 0, i); |
165 | 0 |
|
166 | 0 | for(size_t j = 0; j != 128; ++j) |
167 | 0 | { |
168 | 0 | B[128*B_off+j] = load_le<uint64_t>(H.data(), j); |
169 | 0 | } |
170 | 0 |
|
171 | 0 | Htick(T, &H[0], H.size(), blake2b, H0, 1, i); |
172 | 0 |
|
173 | 0 | for(size_t j = 0; j != 128; ++j) |
174 | 0 | { |
175 | 0 | B[128*(B_off+1)+j] = load_le<uint64_t>(H.data(), j); |
176 | 0 | } |
177 | 0 | } |
178 | 0 | } |
179 | | |
180 | | inline void blamka_G(uint64_t& A, uint64_t& B, uint64_t& C, uint64_t& D) |
181 | 0 | { |
182 | 0 | A += B + (static_cast<uint64_t>(2) * static_cast<uint32_t>(A)) * static_cast<uint32_t>(B); |
183 | 0 | D = rotr<32>(A ^ D); |
184 | 0 |
|
185 | 0 | C += D + (static_cast<uint64_t>(2) * static_cast<uint32_t>(C)) * static_cast<uint32_t>(D); |
186 | 0 | B = rotr<24>(B ^ C); |
187 | 0 |
|
188 | 0 | A += B + (static_cast<uint64_t>(2) * static_cast<uint32_t>(A)) * static_cast<uint32_t>(B); |
189 | 0 | D = rotr<16>(A ^ D); |
190 | 0 |
|
191 | 0 | C += D + (static_cast<uint64_t>(2) * static_cast<uint32_t>(C)) * static_cast<uint32_t>(D); |
192 | 0 | B = rotr<63>(B ^ C); |
193 | 0 | } |
194 | | |
/*
* One round of the BlaMka permutation over a 4x4 matrix of 64-bit
* words V0..VF (row-major): four column mixes followed by four
* diagonal mixes, mirroring the BLAKE2b round structure.
*/
inline void blamka(uint64_t& V0, uint64_t& V1, uint64_t& V2, uint64_t& V3,
                   uint64_t& V4, uint64_t& V5, uint64_t& V6, uint64_t& V7,
                   uint64_t& V8, uint64_t& V9, uint64_t& VA, uint64_t& VB,
                   uint64_t& VC, uint64_t& VD, uint64_t& VE, uint64_t& VF)
   {
   // Columns
   blamka_G(V0, V4, V8, VC);
   blamka_G(V1, V5, V9, VD);
   blamka_G(V2, V6, VA, VE);
   blamka_G(V3, V7, VB, VF);

   // Diagonals
   blamka_G(V0, V5, VA, VF);
   blamka_G(V1, V6, VB, VC);
   blamka_G(V2, V7, V8, VD);
   blamka_G(V3, V4, V9, VE);
   }
210 | | |
/*
* Argon2 compression step: let X = B[prev] ^ B[new_offset], apply the
* BlaMka permutation P to X (rounds over rows, then over column
* pairs), and XOR P(X) ^ X into B[offset].
*
* T is a caller-provided 128-word scratch block, reused across calls.
*/
void process_block_xor(secure_vector<uint64_t>& T,
                       secure_vector<uint64_t>& B,
                       size_t offset,
                       size_t prev,
                       size_t new_offset)
   {
   // T = X = previous block XOR reference block
   for(size_t i = 0; i != 128; ++i)
      T[i] = B[128*prev+i] ^ B[128*new_offset+i];

   // Row-wise rounds: each group of 16 consecutive words
   for(size_t i = 0; i != 128; i += 16)
      {
      blamka(T[i+ 0], T[i+ 1], T[i+ 2], T[i+ 3],
             T[i+ 4], T[i+ 5], T[i+ 6], T[i+ 7],
             T[i+ 8], T[i+ 9], T[i+10], T[i+11],
             T[i+12], T[i+13], T[i+14], T[i+15]);
      }

   // Column-wise rounds: adjacent word pairs taken with stride 16
   for(size_t i = 0; i != 128 / 8; i += 2)
      {
      blamka(T[ i], T[ i+1], T[ 16+i], T[ 16+i+1],
             T[ 32+i], T[ 32+i+1], T[ 48+i], T[ 48+i+1],
             T[ 64+i], T[ 64+i+1], T[ 80+i], T[ 80+i+1],
             T[ 96+i], T[ 96+i+1], T[112+i], T[112+i+1]);
      }

   // B[offset] ^= P(X) ^ X
   for(size_t i = 0; i != 128; ++i)
      B[128*offset + i] ^= T[i] ^ B[128*prev+i] ^ B[128*new_offset+i];
   }
239 | | |
/*
* Generate a block of 128 pseudo-random reference addresses for the
* data-independent (Argon2i) addressing mode.
*
* B is seeded with the position (pass n, lane, slice), the parameters
* (memory, time, mode) and the batch counter cnt, then mixed by two
* rounds of the same row/column BlaMka permutation used by the
* compression function, with a feed-forward XOR after each round.
* T is 128 words of scratch.
*/
void gen_2i_addresses(secure_vector<uint64_t>& T, secure_vector<uint64_t>& B,
                      size_t n, size_t lane, size_t slice, size_t memory,
                      size_t time, size_t mode, size_t cnt)
   {
   BOTAN_ASSERT_NOMSG(B.size() == 128);
   BOTAN_ASSERT_NOMSG(T.size() == 128);

   // Seed block: position/parameters/counter in words 0..6, rest zero
   clear_mem(B.data(), B.size());
   B[0] = n;
   B[1] = lane;
   B[2] = slice;
   B[3] = memory;
   B[4] = time;
   B[5] = mode;
   B[6] = cnt;

   for(size_t r = 0; r != 2; ++r)
      {
      copy_mem(T.data(), B.data(), B.size());

      // Row-wise rounds
      for(size_t i = 0; i != 128; i += 16)
         {
         blamka(T[i+ 0], T[i+ 1], T[i+ 2], T[i+ 3],
                T[i+ 4], T[i+ 5], T[i+ 6], T[i+ 7],
                T[i+ 8], T[i+ 9], T[i+10], T[i+11],
                T[i+12], T[i+13], T[i+14], T[i+15]);
         }
      // Column-wise rounds
      for(size_t i = 0; i != 128 / 8; i += 2)
         {
         blamka(T[ i], T[ i+1], T[ 16+i], T[ 16+i+1],
                T[ 32+i], T[ 32+i+1], T[ 48+i], T[ 48+i+1],
                T[ 64+i], T[ 64+i+1], T[ 80+i], T[ 80+i+1],
                T[ 96+i], T[ 96+i+1], T[112+i], T[112+i+1]);
         }

      // Feed-forward: fold the permuted copy back into B
      for(size_t i = 0; i != 128; ++i)
         B[i] ^= T[i];
      }
   }
279 | | |
/*
* Map the pseudo-random value drawn for a block onto the index of the
* reference block it will be mixed with.
*
* The high 32 bits of random pick the reference lane (forced to the
* current lane during the first slice of the first pass). The window
* of candidate blocks depends on the pass, slice, and whether the
* reference lane is the current lane; the low 32 bits of random are
* then squared and scaled to bias selection towards recent blocks.
*/
uint32_t index_alpha(uint64_t random,
                     size_t lanes,
                     size_t segments,
                     size_t threads,
                     size_t n,
                     size_t slice,
                     size_t lane,
                     size_t index)
   {
   size_t ref_lane = static_cast<uint32_t>(random >> 32) % threads;

   if(n == 0 && slice == 0)
      ref_lane = lane; // first slice of first pass only sees its own lane

   size_t area = 0;  // number of candidate blocks
   size_t start = 0; // window offset within the lane

   if(n == 0)
      {
      // First pass: only blocks already written during this pass
      area = slice*segments;
      if(slice == 0 || lane == ref_lane)
         area += index;
      }
   else
      {
      // Later passes: the three completed slices, starting just after
      // the slice currently being overwritten
      area = 3*segments;
      start = ((slice+1) % 4)*segments;
      if(lane == ref_lane)
         area += index;
      }

   if(index == 0 || lane == ref_lane)
      area -= 1;

   // phi(random): square-and-scale of the low 32 bits
   uint64_t p = static_cast<uint32_t>(random);
   p = (p * p) >> 32;
   p = (p * area) >> 32;

   return static_cast<uint32_t>(ref_lane*lanes + (start + area - (p+1)) % lanes);
   }
316 | | |
317 | | void process_block_argon2d(secure_vector<uint64_t>& T, |
318 | | secure_vector<uint64_t>& B, |
319 | | size_t n, size_t slice, size_t lane, |
320 | | size_t lanes, size_t segments, size_t threads) |
321 | 0 | { |
322 | 0 | size_t index = 0; |
323 | 0 | if(n == 0 && slice == 0) |
324 | 0 | index = 2; |
325 | 0 |
|
326 | 0 | while(index < segments) |
327 | 0 | { |
328 | 0 | const size_t offset = lane*lanes + slice*segments + index; |
329 | 0 |
|
330 | 0 | size_t prev = offset - 1; |
331 | 0 | if(index == 0 && slice == 0) |
332 | 0 | prev += lanes; |
333 | 0 |
|
334 | 0 | const uint64_t random = B.at(128*prev); |
335 | 0 | const size_t new_offset = index_alpha(random, lanes, segments, threads, n, slice, lane, index); |
336 | 0 |
|
337 | 0 | process_block_xor(T, B, offset, prev, new_offset); |
338 | 0 |
|
339 | 0 | index += 1; |
340 | 0 | } |
341 | 0 | } |
342 | | |
343 | | void process_block_argon2i(secure_vector<uint64_t>& T, |
344 | | secure_vector<uint64_t>& B, |
345 | | size_t n, size_t slice, size_t lane, |
346 | | size_t lanes, size_t segments, size_t threads, uint8_t mode, |
347 | | size_t memory, size_t time) |
348 | 0 | { |
349 | 0 | size_t index = 0; |
350 | 0 | if(n == 0 && slice == 0) |
351 | 0 | index = 2; |
352 | 0 |
|
353 | 0 | secure_vector<uint64_t> addresses(128); |
354 | 0 | size_t address_counter = 1; |
355 | 0 |
|
356 | 0 | gen_2i_addresses(T, addresses, n, lane, slice, memory, time, mode, address_counter); |
357 | 0 |
|
358 | 0 | while(index < segments) |
359 | 0 | { |
360 | 0 | const size_t offset = lane*lanes + slice*segments + index; |
361 | 0 |
|
362 | 0 | size_t prev = offset - 1; |
363 | 0 | if(index == 0 && slice == 0) |
364 | 0 | prev += lanes; |
365 | 0 |
|
366 | 0 | if(index > 0 && index % 128 == 0) |
367 | 0 | { |
368 | 0 | address_counter += 1; |
369 | 0 | gen_2i_addresses(T, addresses, n, lane, slice, memory, time, mode, address_counter); |
370 | 0 | } |
371 | 0 |
|
372 | 0 | const uint64_t random = addresses[index % 128]; |
373 | 0 | const size_t new_offset = index_alpha(random, lanes, segments, threads, n, slice, lane, index); |
374 | 0 |
|
375 | 0 | process_block_xor(T, B, offset, prev, new_offset); |
376 | 0 |
|
377 | 0 | index += 1; |
378 | 0 | } |
379 | 0 | } |
380 | | |
381 | | void process_blocks(secure_vector<uint64_t>& B, |
382 | | size_t t, |
383 | | size_t memory, |
384 | | size_t threads, |
385 | | uint8_t mode) |
386 | 0 | { |
387 | 0 | const size_t lanes = memory / threads; |
388 | 0 | const size_t segments = lanes / SYNC_POINTS; |
389 | 0 |
|
390 | 0 | secure_vector<uint64_t> T(128); |
391 | 0 | for(size_t n = 0; n != t; ++n) |
392 | 0 | { |
393 | 0 | for(size_t slice = 0; slice != SYNC_POINTS; ++slice) |
394 | 0 | { |
395 | 0 | // TODO can run this in Thread_Pool |
396 | 0 | for(size_t lane = 0; lane != threads; ++lane) |
397 | 0 | { |
398 | 0 | if(mode == 1 || (mode == 2 && n == 0 && slice < SYNC_POINTS/2)) |
399 | 0 | process_block_argon2i(T, B, n, slice, lane, lanes, segments, threads, mode, memory, t); |
400 | 0 | else |
401 | 0 | process_block_argon2d(T, B, n, slice, lane, lanes, segments, threads); |
402 | 0 | } |
403 | 0 | } |
404 | 0 | } |
405 | 0 |
|
406 | 0 | } |
407 | | |
408 | | } |
409 | | |
/*
* Argon2 password hashing (version 19 / v1.3).
*
* mode selects the variant: 0 = Argon2d, 1 = Argon2i, 2 = Argon2id
* (see process_blocks for how it steers block addressing).
* M is the memory cost in 1 KiB blocks, t the number of passes,
* threads the parallelism. output/output_len receive the tag;
* key and ad are the optional secret key and associated data.
* Throws Invalid_Argument (via BOTAN_ARG_CHECK) on bad parameters.
*/
void argon2(uint8_t output[], size_t output_len,
            const char* password, size_t password_len,
            const uint8_t salt[], size_t salt_len,
            const uint8_t key[], size_t key_len,
            const uint8_t ad[], size_t ad_len,
            uint8_t mode, size_t threads, size_t M, size_t t)
   {
   BOTAN_ARG_CHECK(mode == 0 || mode == 1 || mode == 2, "Unknown Argon2 mode parameter");
   BOTAN_ARG_CHECK(output_len >= 4, "Invalid Argon2 output length");
   BOTAN_ARG_CHECK(threads >= 1 && threads <= 128, "Invalid Argon2 threads parameter");
   BOTAN_ARG_CHECK(M >= 8*threads && M <= 8192*1024, "Invalid Argon2 M parameter");
   BOTAN_ARG_CHECK(t >= 1, "Invalid Argon2 t parameter");

   std::unique_ptr<HashFunction> blake2 = HashFunction::create_or_throw("BLAKE2b");

   // H0 commits to all parameters and inputs
   const auto H0 = argon2_H0(*blake2, output_len,
                             password, password_len,
                             salt, salt_len,
                             key, key_len,
                             ad, ad_len,
                             mode, threads, M, t);

   // Round the block count down to a multiple of SYNC_POINTS*threads
   // so every lane splits evenly into 4 slices
   const size_t memory = (M / (SYNC_POINTS*threads)) * (SYNC_POINTS*threads);

   // Each 1024-byte block is stored as 128 64-bit words
   secure_vector<uint64_t> B(memory * 1024/8);

   init_blocks(B, *blake2, H0, memory, threads);
   process_blocks(B, t, memory, threads, mode);

   clear_mem(output, output_len);
   extract_key(output, output_len, B, memory, threads);
   }
442 | | |
443 | | } |