/src/botan/src/lib/hash/sha2_32/sha2_32_bmi2/sha2_32_bmi2.cpp
Line | Count | Source |
1 | | /* |
2 | | * (C) 2018 Jack Lloyd |
3 | | * |
4 | | * Botan is released under the Simplified BSD License (see license.txt) |
5 | | */ |
6 | | |
7 | | #include <botan/internal/sha2_32.h> |
8 | | #include <botan/internal/loadstor.h> |
9 | | #include <botan/internal/rotate.h> |
10 | | #include <botan/internal/bit_ops.h> |
11 | | |
12 | | namespace Botan { |
13 | | |
14 | | /* |
15 | | Your eyes do not deceive you; this is currently just a copy of the |
16 | | baseline SHA-256 implementation. Because we compile it with BMI2 |
17 | | flags, GCC and Clang use the BMI2 instructions without further help. |
18 | | |
19 | | Likely instruction scheduling could be improved by using inline asm. |
20 | | */ |
21 | | |
/*
* One SHA-256 round combined with one step of message schedule expansion.
*
*   A..H       the eight working variables in the order that applies to
*              this round; only D and H are written, the rest are read.
*   M1         schedule word added into this round; afterwards it is
*              increased by sigma(M2) + M3 + sigma(M4), where the two
*              sigma values are the rotate/shift mixes computed below.
*   M2,M3,M4   schedule words that are only read.
*   magic      the constant added into H for this round.
*
* Every argument is expanded more than once, so pass plain variables or
* constants without side effects. D, H and M1 must be modifiable lvalues.
*
* The expansion intentionally does NOT end in a semicolon: the caller's
* own `;` completes the do/while statement, which keeps the macro usable
* as the sole statement of an if/else branch.
*/
#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic) do {                 \
   const uint32_t A_rho = rotr<2>(A) ^ rotr<13>(A) ^ rotr<22>(A);                     \
   const uint32_t E_rho = rotr<6>(E) ^ rotr<11>(E) ^ rotr<25>(E);                     \
   const uint32_t M2_sigma = rotr<17>(M2) ^ rotr<19>(M2) ^ ((M2) >> 10);              \
   const uint32_t M4_sigma = rotr<7>(M4) ^ rotr<18>(M4) ^ ((M4) >> 3);                \
   (H) += (magic) + E_rho + choose((E), (F), (G)) + (M1);                             \
   (D) += (H);                                                                        \
   (H) += A_rho + majority((A), (B), (C));                                            \
   (M1) += M2_sigma + (M3) + M4_sigma;                                                \
   } while(0)
32 | | |
33 | | void SHA_256::compress_digest_x86_bmi2(secure_vector<uint32_t>& digest, |
34 | | const uint8_t input[], |
35 | | size_t blocks) |
36 | 749k | { |
37 | 749k | uint32_t A = digest[0], B = digest[1], C = digest[2], |
38 | 749k | D = digest[3], E = digest[4], F = digest[5], |
39 | 749k | G = digest[6], H = digest[7]; |
40 | | |
41 | 1.67M | for(size_t i = 0; i != blocks; ++i) |
42 | 920k | { |
43 | 920k | uint32_t W00 = load_be<uint32_t>(input, 0); |
44 | 920k | uint32_t W01 = load_be<uint32_t>(input, 1); |
45 | 920k | uint32_t W02 = load_be<uint32_t>(input, 2); |
46 | 920k | uint32_t W03 = load_be<uint32_t>(input, 3); |
47 | 920k | uint32_t W04 = load_be<uint32_t>(input, 4); |
48 | 920k | uint32_t W05 = load_be<uint32_t>(input, 5); |
49 | 920k | uint32_t W06 = load_be<uint32_t>(input, 6); |
50 | 920k | uint32_t W07 = load_be<uint32_t>(input, 7); |
51 | 920k | uint32_t W08 = load_be<uint32_t>(input, 8); |
52 | 920k | uint32_t W09 = load_be<uint32_t>(input, 9); |
53 | 920k | uint32_t W10 = load_be<uint32_t>(input, 10); |
54 | 920k | uint32_t W11 = load_be<uint32_t>(input, 11); |
55 | 920k | uint32_t W12 = load_be<uint32_t>(input, 12); |
56 | 920k | uint32_t W13 = load_be<uint32_t>(input, 13); |
57 | 920k | uint32_t W14 = load_be<uint32_t>(input, 14); |
58 | 920k | uint32_t W15 = load_be<uint32_t>(input, 15); |
59 | | |
60 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x428A2F98); |
61 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x71374491); |
62 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0xB5C0FBCF); |
63 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0xE9B5DBA5); |
64 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x3956C25B); |
65 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x59F111F1); |
66 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x923F82A4); |
67 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0xAB1C5ED5); |
68 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xD807AA98); |
69 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x12835B01); |
70 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x243185BE); |
71 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x550C7DC3); |
72 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x72BE5D74); |
73 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE); |
74 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7); |
75 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174); |
76 | | |
77 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1); |
78 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0xEFBE4786); |
79 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6); |
80 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x240CA1CC); |
81 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x2DE92C6F); |
82 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4A7484AA); |
83 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5CB0A9DC); |
84 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x76F988DA); |
85 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x983E5152); |
86 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA831C66D); |
87 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xB00327C8); |
88 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xBF597FC7); |
89 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xC6E00BF3); |
90 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147); |
91 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351); |
92 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967); |
93 | | |
94 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85); |
95 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138); |
96 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC); |
97 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x53380D13); |
98 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x650A7354); |
99 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x766A0ABB); |
100 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x81C2C92E); |
101 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x92722C85); |
102 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xA2BFE8A1); |
103 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA81A664B); |
104 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xC24B8B70); |
105 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xC76C51A3); |
106 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xD192E819); |
107 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624); |
108 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585); |
109 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070); |
110 | | |
111 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116); |
112 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08); |
113 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C); |
114 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x34B0BCB5); |
115 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x391C0CB3); |
116 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4ED8AA4A); |
117 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5B9CCA4F); |
118 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x682E6FF3); |
119 | 920k | SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x748F82EE); |
120 | 920k | SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x78A5636F); |
121 | 920k | SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x84C87814); |
122 | 920k | SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x8CC70208); |
123 | 920k | SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x90BEFFFA); |
124 | 920k | SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xA4506CEB); |
125 | 920k | SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xBEF9A3F7); |
126 | 920k | SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC67178F2); |
127 | | |
128 | 920k | A = (digest[0] += A); |
129 | 920k | B = (digest[1] += B); |
130 | 920k | C = (digest[2] += C); |
131 | 920k | D = (digest[3] += D); |
132 | 920k | E = (digest[4] += E); |
133 | 920k | F = (digest[5] += F); |
134 | 920k | G = (digest[6] += G); |
135 | 920k | H = (digest[7] += H); |
136 | | |
137 | 920k | input += 64; |
138 | 920k | } |
139 | 749k | } |
140 | | |
141 | | } |