/src/libsodium/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-sse41.c
Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #define BLAKE2_USE_SSSE3 |
3 | | #define BLAKE2_USE_SSE41 |
4 | | |
5 | | #include <stdint.h> |
6 | | #include <string.h> |
7 | | |
8 | | #include "blake2.h" |
9 | | #include "private/common.h" |
10 | | |
11 | | #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ |
12 | | defined(HAVE_SMMINTRIN_H) |
13 | | |
14 | | # ifdef __clang__ |
15 | | # pragma clang attribute push(__attribute__((target("sse2,ssse3,sse4.1"))), apply_to = function) |
16 | | # elif defined(__GNUC__) |
17 | | # pragma GCC target("sse2,ssse3,sse4.1") |
18 | | # endif |
19 | | |
20 | | # include <emmintrin.h> |
21 | | # include <smmintrin.h> |
22 | | # include <tmmintrin.h> |
23 | | # include "private/sse2_64_32.h" |
24 | | |
25 | | # include "blake2b-compress-sse41.h" |
26 | | |
/*
 * blake2b_IV: eight 64-bit constants used by blake2b_compress_sse41() in this
 * file. Words 0..3 seed the third working row as-is; words 4..7 are XORed
 * with S->t and S->f to seed the fourth working row.
 * The values match the BLAKE2b initialization vector listed in RFC 7693.
 * NOTE(review): CRYPTO_ALIGN is defined outside this listing; presumably it
 * requests 64-byte alignment for the table -- confirm in private/common.h.
 */
27 | | CRYPTO_ALIGN(64) |
28 | | static const uint64_t blake2b_IV[8] = { |
29 | | 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, |
30 | | 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, |
31 | | 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL |
32 | | }; |
33 | | |
/*
 * blake2b_compress_sse41: mix one message block into the hash state using
 * 128-bit SSE registers.
 *
 * S      State to update. Reads S->h[0..7], and S->t / S->f starting at
 *        index 0; writes S->h[0..7]. Nothing else in S is stored to here.
 * block  Input block; eight 16-byte chunks at offsets 0..112 are loaded,
 *        i.e. 128 bytes in total.
 *
 * Returns 0 unconditionally; there is no failure path in this body.
 *
 * NOTE(review): ROUND, LOADU and STOREU are not defined in this listing;
 * presumably they are macros from the headers included above. Several locals
 * (r16, r24, b0, b1, t0, t1, m0..m7) are never referenced directly in this
 * body, so ROUND presumably refers to them by name -- confirm before renaming
 * or removing any of them.
 */
34 | | int |
35 | | blake2b_compress_sse41(blake2b_state *S, |
36 | | const uint8_t block[BLAKE2B_BLOCKBYTES]) |
37 | 0 | { |
/* Working state: four rows, each split into a low and a high 128-bit half. */
38 | 0 | __m128i row1l, row1h; |
39 | 0 | __m128i row2l, row2h; |
40 | 0 | __m128i row3l, row3h; |
41 | 0 | __m128i row4l, row4h; |
/* Scratch registers; not used directly below (presumably used by ROUND). */
42 | 0 | __m128i b0, b1; |
43 | 0 | __m128i t0, t1; |
/*
 * Byte-index tables. Within each 8-byte lane, r16 starts at byte 2 and r24 at
 * byte 3, wrapping around. Presumably used as byte-shuffle masks that rotate
 * each 64-bit lane by 16 and 24 bits respectively -- confirm in the ROUND
 * definition.
 */
44 | 0 | const __m128i r16 = |
45 | 0 | _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); |
46 | 0 | const __m128i r24 = |
47 | 0 | _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); |
/* Message block as eight 16-byte chunks. "00" is just an octal-spelled zero. */
48 | 0 | const __m128i m0 = LOADU(block + 00); |
49 | 0 | const __m128i m1 = LOADU(block + 16); |
50 | 0 | const __m128i m2 = LOADU(block + 32); |
51 | 0 | const __m128i m3 = LOADU(block + 48); |
52 | 0 | const __m128i m4 = LOADU(block + 64); |
53 | 0 | const __m128i m5 = LOADU(block + 80); |
54 | 0 | const __m128i m6 = LOADU(block + 96); |
55 | 0 | const __m128i m7 = LOADU(block + 112); |
/* Rows 1 and 2 start from the current chaining value S->h. */
56 | 0 | row1l = LOADU(&S->h[0]); |
57 | 0 | row1h = LOADU(&S->h[2]); |
58 | 0 | row2l = LOADU(&S->h[4]); |
59 | 0 | row2h = LOADU(&S->h[6]); |
/* Row 3 starts from the first half of the constant table. */
60 | 0 | row3l = LOADU(&blake2b_IV[0]); |
61 | 0 | row3h = LOADU(&blake2b_IV[2]); |
/* Row 4 is the second half of the constant table XORed with S->t and S->f. */
62 | 0 | row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); |
63 | 0 | row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); |
/* Twelve rounds, indexed 0..11; the round body lives in the ROUND macro. */
64 | 0 | ROUND(0); |
65 | 0 | ROUND(1); |
66 | 0 | ROUND(2); |
67 | 0 | ROUND(3); |
68 | 0 | ROUND(4); |
69 | 0 | ROUND(5); |
70 | 0 | ROUND(6); |
71 | 0 | ROUND(7); |
72 | 0 | ROUND(8); |
73 | 0 | ROUND(9); |
74 | 0 | ROUND(10); |
75 | 0 | ROUND(11); |
/* Feed-forward: h[0..3] ^= row1 ^ row3. */
76 | 0 | row1l = _mm_xor_si128(row3l, row1l); |
77 | 0 | row1h = _mm_xor_si128(row3h, row1h); |
78 | 0 | STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); |
79 | 0 | STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); |
/* Feed-forward: h[4..7] ^= row2 ^ row4. */
80 | 0 | row2l = _mm_xor_si128(row4l, row2l); |
81 | 0 | row2h = _mm_xor_si128(row4h, row2h); |
82 | 0 | STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); |
83 | 0 | STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); |
84 | 0 | return 0; |
85 | 0 | } |
86 | | |
87 | | # ifdef __clang__ |
88 | | # pragma clang attribute pop |
89 | | # endif |
90 | | |
91 | | #endif |