/src/BearSSL/src/symcipher/chacha20_sse2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining |
5 | | * a copy of this software and associated documentation files (the |
6 | | * "Software"), to deal in the Software without restriction, including |
7 | | * without limitation the rights to use, copy, modify, merge, publish, |
8 | | * distribute, sublicense, and/or sell copies of the Software, and to |
9 | | * permit persons to whom the Software is furnished to do so, subject to |
10 | | * the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice shall be |
13 | | * included in all copies or substantial portions of the Software. |
14 | | * |
15 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
16 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
17 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
18 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
19 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
20 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
21 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #define BR_ENABLE_INTRINSICS 1 |
26 | | #include "inner.h" |
27 | | |
28 | | #if BR_SSE2 |
29 | | |
30 | | /* |
31 | | * This file contains a ChaCha20 implementation that leverages SSE2 |
32 | | * opcodes for better performance. |
33 | | */ |
34 | | |
35 | | /* see bearssl_block.h */ |
36 | | br_chacha20_run |
37 | | br_chacha20_sse2_get(void) |
38 | 177 | { |
39 | | /* |
40 | | * If using 64-bit mode, then SSE2 opcodes should be automatically |
41 | | * available, since they are part of the ABI. |
42 | | * |
43 | | * In 32-bit mode, we use CPUID to detect the SSE2 feature. |
44 | | */ |
45 | | |
46 | 177 | #if BR_amd64 |
47 | 177 | return &br_chacha20_sse2_run; |
48 | | #else |
49 | | |
50 | | /* |
51 | | * SSE2 support is indicated by bit 26 in EDX. |
52 | | */ |
53 | | if (br_cpuid(0, 0, 0, 0x04000000)) { |
54 | | return &br_chacha20_sse2_run; |
55 | | } else { |
56 | | return 0; |
57 | | } |
58 | | #endif |
59 | 177 | } |
60 | | |
61 | | BR_TARGETS_X86_UP |
62 | | |
63 | | /* see bearssl_block.h */ |
64 | | BR_TARGET("sse2") |
65 | | uint32_t |
66 | | br_chacha20_sse2_run(const void *key, |
67 | | const void *iv, uint32_t cc, void *data, size_t len) |
68 | 244 | { |
69 | 244 | unsigned char *buf; |
70 | 244 | uint32_t ivtmp[4]; |
71 | 244 | __m128i kw0, kw1; |
72 | 244 | __m128i iw, cw; |
73 | 244 | __m128i one; |
74 | | |
75 | 244 | static const uint32_t CW[] = { |
76 | 244 | 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 |
77 | 244 | }; |
78 | | |
79 | 244 | buf = data; |
80 | 244 | kw0 = _mm_loadu_si128(key); |
81 | 244 | kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); |
82 | 244 | ivtmp[0] = cc; |
83 | 244 | memcpy(ivtmp + 1, iv, 12); |
84 | 244 | iw = _mm_loadu_si128((const void *)ivtmp); |
85 | 244 | cw = _mm_loadu_si128((const void *)CW); |
86 | 244 | one = _mm_set_epi32(0, 0, 0, 1); |
87 | | |
88 | 1.61k | while (len > 0) { |
89 | | /* |
90 | | * sj contains state words 4*j to 4*j+3. |
91 | | */ |
92 | 1.52k | __m128i s0, s1, s2, s3; |
93 | 1.52k | int i; |
94 | | |
95 | 1.52k | s0 = cw; |
96 | 1.52k | s1 = kw0; |
97 | 1.52k | s2 = kw1; |
98 | 1.52k | s3 = iw; |
99 | 16.8k | for (i = 0; i < 10; i ++) { |
100 | | /* |
101 | | * Even round is straightforward application on |
102 | | * the state words. |
103 | | */ |
104 | 15.2k | s0 = _mm_add_epi32(s0, s1); |
105 | 15.2k | s3 = _mm_xor_si128(s3, s0); |
106 | 15.2k | s3 = _mm_or_si128( |
107 | 15.2k | _mm_slli_epi32(s3, 16), |
108 | 15.2k | _mm_srli_epi32(s3, 16)); |
109 | | |
110 | 15.2k | s2 = _mm_add_epi32(s2, s3); |
111 | 15.2k | s1 = _mm_xor_si128(s1, s2); |
112 | 15.2k | s1 = _mm_or_si128( |
113 | 15.2k | _mm_slli_epi32(s1, 12), |
114 | 15.2k | _mm_srli_epi32(s1, 20)); |
115 | | |
116 | 15.2k | s0 = _mm_add_epi32(s0, s1); |
117 | 15.2k | s3 = _mm_xor_si128(s3, s0); |
118 | 15.2k | s3 = _mm_or_si128( |
119 | 15.2k | _mm_slli_epi32(s3, 8), |
120 | 15.2k | _mm_srli_epi32(s3, 24)); |
121 | | |
122 | 15.2k | s2 = _mm_add_epi32(s2, s3); |
123 | 15.2k | s1 = _mm_xor_si128(s1, s2); |
124 | 15.2k | s1 = _mm_or_si128( |
125 | 15.2k | _mm_slli_epi32(s1, 7), |
126 | 15.2k | _mm_srli_epi32(s1, 25)); |
127 | | |
128 | | /* |
129 | | * For the odd round, we must rotate some state |
130 | | * words so that the computations apply on the |
131 | | * right combinations of words. |
132 | | */ |
133 | 15.2k | s1 = _mm_shuffle_epi32(s1, 0x39); |
134 | 15.2k | s2 = _mm_shuffle_epi32(s2, 0x4E); |
135 | 15.2k | s3 = _mm_shuffle_epi32(s3, 0x93); |
136 | | |
137 | 15.2k | s0 = _mm_add_epi32(s0, s1); |
138 | 15.2k | s3 = _mm_xor_si128(s3, s0); |
139 | 15.2k | s3 = _mm_or_si128( |
140 | 15.2k | _mm_slli_epi32(s3, 16), |
141 | 15.2k | _mm_srli_epi32(s3, 16)); |
142 | | |
143 | 15.2k | s2 = _mm_add_epi32(s2, s3); |
144 | 15.2k | s1 = _mm_xor_si128(s1, s2); |
145 | 15.2k | s1 = _mm_or_si128( |
146 | 15.2k | _mm_slli_epi32(s1, 12), |
147 | 15.2k | _mm_srli_epi32(s1, 20)); |
148 | | |
149 | 15.2k | s0 = _mm_add_epi32(s0, s1); |
150 | 15.2k | s3 = _mm_xor_si128(s3, s0); |
151 | 15.2k | s3 = _mm_or_si128( |
152 | 15.2k | _mm_slli_epi32(s3, 8), |
153 | 15.2k | _mm_srli_epi32(s3, 24)); |
154 | | |
155 | 15.2k | s2 = _mm_add_epi32(s2, s3); |
156 | 15.2k | s1 = _mm_xor_si128(s1, s2); |
157 | 15.2k | s1 = _mm_or_si128( |
158 | 15.2k | _mm_slli_epi32(s1, 7), |
159 | 15.2k | _mm_srli_epi32(s1, 25)); |
160 | | |
161 | | /* |
162 | | * After the odd round, we rotate back the values |
163 | | * to undo the rotate at the start of the odd round. |
164 | | */ |
165 | 15.2k | s1 = _mm_shuffle_epi32(s1, 0x93); |
166 | 15.2k | s2 = _mm_shuffle_epi32(s2, 0x4E); |
167 | 15.2k | s3 = _mm_shuffle_epi32(s3, 0x39); |
168 | 15.2k | } |
169 | | |
170 | | /* |
171 | | * Addition with the initial state. |
172 | | */ |
173 | 1.52k | s0 = _mm_add_epi32(s0, cw); |
174 | 1.52k | s1 = _mm_add_epi32(s1, kw0); |
175 | 1.52k | s2 = _mm_add_epi32(s2, kw1); |
176 | 1.52k | s3 = _mm_add_epi32(s3, iw); |
177 | | |
178 | | /* |
179 | | * Increment block counter. |
180 | | */ |
181 | 1.52k | iw = _mm_add_epi32(iw, one); |
182 | | |
183 | | /* |
184 | | * XOR final state with the data. |
185 | | */ |
186 | 1.52k | if (len < 64) { |
187 | 158 | unsigned char tmp[64]; |
188 | 158 | size_t u; |
189 | | |
190 | 158 | _mm_storeu_si128((void *)(tmp + 0), s0); |
191 | 158 | _mm_storeu_si128((void *)(tmp + 16), s1); |
192 | 158 | _mm_storeu_si128((void *)(tmp + 32), s2); |
193 | 158 | _mm_storeu_si128((void *)(tmp + 48), s3); |
194 | 3.58k | for (u = 0; u < len; u ++) { |
195 | 3.42k | buf[u] ^= tmp[u]; |
196 | 3.42k | } |
197 | 158 | break; |
198 | 1.37k | } else { |
199 | 1.37k | __m128i b0, b1, b2, b3; |
200 | | |
201 | 1.37k | b0 = _mm_loadu_si128((const void *)(buf + 0)); |
202 | 1.37k | b1 = _mm_loadu_si128((const void *)(buf + 16)); |
203 | 1.37k | b2 = _mm_loadu_si128((const void *)(buf + 32)); |
204 | 1.37k | b3 = _mm_loadu_si128((const void *)(buf + 48)); |
205 | 1.37k | b0 = _mm_xor_si128(b0, s0); |
206 | 1.37k | b1 = _mm_xor_si128(b1, s1); |
207 | 1.37k | b2 = _mm_xor_si128(b2, s2); |
208 | 1.37k | b3 = _mm_xor_si128(b3, s3); |
209 | 1.37k | _mm_storeu_si128((void *)(buf + 0), b0); |
210 | 1.37k | _mm_storeu_si128((void *)(buf + 16), b1); |
211 | 1.37k | _mm_storeu_si128((void *)(buf + 32), b2); |
212 | 1.37k | _mm_storeu_si128((void *)(buf + 48), b3); |
213 | 1.37k | buf += 64; |
214 | 1.37k | len -= 64; |
215 | 1.37k | } |
216 | 1.52k | } |
217 | | |
218 | | /* |
219 | | * _mm_extract_epi32() requires SSE4.1. We prefer to stick to |
220 | | * raw SSE2, thus we use _mm_extract_epi16(). |
221 | | */ |
222 | 244 | return (uint32_t)_mm_extract_epi16(iw, 0) |
223 | | | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); |
224 | 244 | } |
225 | | |
226 | | BR_TARGETS_X86_DOWN |
227 | | |
228 | | #else |
229 | | |
230 | | /* see bearssl_block.h */ |
231 | | br_chacha20_run |
232 | | br_chacha20_sse2_get(void) |
233 | | { |
234 | | return 0; |
235 | | } |
236 | | |
237 | | #endif |