/src/xz/src/liblzma/check/crc_x86_clmul.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file crc_x86_clmul.h |
6 | | /// \brief CRC32 and CRC64 implementations using CLMUL instructions. |
7 | | /// |
8 | | /// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and |
9 | | /// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too. |
10 | | /// |
11 | | /// See the Intel white paper "Fast CRC Computation for Generic Polynomials |
12 | | /// Using PCLMULQDQ Instruction" from 2009. The original file seems to be |
13 | | /// gone from Intel's website but a version is available here: |
14 | | /// https://www.researchgate.net/publication/263424619_Fast_CRC_computation |
15 | | /// (The link was checked on 2024-06-11.) |
16 | | /// |
17 | | /// While this file has both CRC32 and CRC64 implementations, only one |
18 | | /// can be built at a time. The version to build is selected by defining |
19 | | /// BUILDING_CRC_CLMUL to 32 or 64 before including this file. |
20 | | /// |
21 | | /// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0. |
22 | | // |
23 | | // Authors: Lasse Collin |
24 | | // Ilya Kurdyukov |
25 | | // |
26 | | /////////////////////////////////////////////////////////////////////////////// |
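As the note above says, exactly one variant is compiled per translation unit by defining BUILDING_CRC_CLMUL before the include. A minimal usage sketch in the spirit of what crc32_fast.c and crc64_fast.c do (the exact surrounding code in those files may differ):

    // Select and build the CRC32 variant of this header.
    #define BUILDING_CRC_CLMUL 32
    #include "crc_x86_clmul.h"
    // A CRC64 consumer defines BUILDING_CRC_CLMUL to 64 instead.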
27 | | |
28 | | // This file must not be included more than once. |
29 | | #ifdef LZMA_CRC_X86_CLMUL_H |
30 | | # error crc_x86_clmul.h was included twice. |
31 | | #endif |
32 | | #define LZMA_CRC_X86_CLMUL_H |
33 | | |
34 | | #if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64 |
35 | | # error BUILDING_CRC_CLMUL is undefined or has an invalid value |
36 | | #endif |
37 | | |
38 | | #include <immintrin.h> |
39 | | |
40 | | #if defined(_MSC_VER) |
41 | | # include <intrin.h> |
42 | | #elif defined(HAVE_CPUID_H) |
43 | | # include <cpuid.h> |
44 | | #endif |
45 | | |
46 | | |
47 | | // EDG-based compilers (Intel's classic compiler and compiler for E2K) can |
48 | | // define __GNUC__ but the attribute must not be used with them. |
49 | | // The new Clang-based ICX needs the attribute. |
50 | | // |
51 | | // NOTE: Build systems check for this too, keep them in sync with this. |
52 | | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) |
53 | | # define crc_attr_target \ |
54 | | __attribute__((__target__("ssse3,sse4.1,pclmul"))) |
55 | | #else |
56 | | # define crc_attr_target |
57 | | #endif |
58 | | |
59 | | |
60 | | // GCC and Clang would produce good code with _mm_set_epi64x |
61 | | // but MSVC needs _mm_cvtsi64_si128 on x86-64. |
62 | | #if defined(__i386__) || defined(_M_IX86) |
63 | | # define my_set_low64(a) _mm_set_epi64x(0, (a)) |
64 | | #else |
65 | 1.02M | # define my_set_low64(a) _mm_cvtsi64_si128(a) |
66 | | #endif |
67 | | |
68 | | |
69 | | // Align it so that the whole array is within the same cache line. |
70 | | // More than one unaligned load can be done from this during the |
71 | | // same CRC function call. |
72 | | // |
73 | | // The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN |
74 | | // those could be used to clear the high bytes too but it's not needed here.) |
75 | | // |
76 | | // The bytes [16] to [47] are for left shifts. |
77 | | // The bytes [32] to [63] are for right shifts. |
78 | | alignas(64) |
79 | | static uint8_t vmasks[64] = { |
80 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
81 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
82 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
83 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
84 | | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
85 | | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
86 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
87 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
88 | | }; |
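To make the table layout concrete, here is a scalar model of how the helpers below index into it. This is an illustration only (the model_* names are invented, and the real code uses PAND/PSHUFB); it relies on the PSHUFB rule that an index byte with its high bit set produces a zero output byte:

    // Standalone, this sketch would also need <stdint.h>, <stddef.h>, <string.h>.
    // It refers to the vmasks table defined above; amount/count are in [0, 16].

    // Model of shift_left(v, amount): shuffle with vmasks + 32 - amount.
    // Output byte i becomes 0 for i < amount and v[i - amount] otherwise.
    static void model_shift_left(uint8_t v[16], size_t amount)
    {
            const uint8_t *idx = vmasks + 32 - amount;
            uint8_t out[16];
            for (size_t i = 0; i < 16; ++i)
                    out[i] = (idx[i] & 0x80) ? 0 : v[idx[i] & 0x0F];
            memcpy(v, out, 16);
    }

    // Model of keep_high_bytes(v, count): the mask at vmasks + count has
    // 16 - count zero bytes followed by count 0xFF bytes, so the AND clears
    // the low 16 - count bytes and keeps the high count bytes.
    static void model_keep_high_bytes(uint8_t v[16], size_t count)
    {
            for (size_t i = 0; i < 16 - count; ++i)
                    v[i] = 0;
    }

shift_right() below is the mirror image: shuffling with vmasks + 32 + amount moves byte i + amount to position i and zeroes the top "amount" bytes.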
89 | | |
90 | | |
91 | | // *Unaligned* 128-bit load |
92 | | crc_attr_target |
93 | | static inline __m128i |
94 | | my_load128(const uint8_t *p) |
95 | 482k | { |
96 | 482k | return _mm_loadu_si128((const __m128i *)p); |
97 | 482k | }
(Per-instantiation counts: 358k and 123k for the two including translation units.)
98 | | |
99 | | |
100 | | // Keep the highest "count" bytes as is and clear the remaining low bytes. |
101 | | crc_attr_target |
102 | | static inline __m128i |
103 | | keep_high_bytes(__m128i v, size_t count) |
104 | 4.64k | { |
105 | 4.64k | return _mm_and_si128(my_load128((vmasks + count)), v); |
106 | 4.64k | }
(Per-instantiation counts: crc32_fast.c:keep_high_bytes 876, crc64_fast.c:keep_high_bytes 3.77k.)
107 | | |
108 | | |
109 | | // Shift the 128-bit value left by "amount" bytes (not bits). |
110 | | crc_attr_target |
111 | | static inline __m128i |
112 | | shift_left(__m128i v, size_t amount) |
113 | 355k | { |
114 | 355k | return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount))); |
115 | 355k | }
(Per-instantiation counts: 347k and 7.60k for the two including translation units.)
116 | | |
117 | | |
118 | | // Shift the 128-bit value right by "amount" bytes (not bits). |
119 | | crc_attr_target |
120 | | static inline __m128i |
121 | | shift_right(__m128i v, size_t amount) |
122 | 4.64k | { |
123 | 4.64k | return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount))); |
124 | 4.64k | }
(Per-instantiation counts: 876 and 3.77k for the two including translation units.)
125 | | |
126 | | |
127 | | crc_attr_target |
128 | | static inline __m128i |
129 | | fold(__m128i v, __m128i k) |
130 | 109k | { |
131 | 109k | __m128i a = _mm_clmulepi64_si128(v, k, 0x00); |
132 | 109k | __m128i b = _mm_clmulepi64_si128(v, k, 0x11); |
133 | 109k | return _mm_xor_si128(a, b); |
134 | 109k | }
(Per-instantiation counts: 5.44k and 104k for the two including translation units.)
135 | | |
136 | | |
137 | | crc_attr_target |
138 | | static inline __m128i |
139 | | fold_xor(__m128i v, __m128i k, const uint8_t *buf) |
140 | 96.9k | { |
141 | 96.9k | return _mm_xor_si128(my_load128(buf), fold(v, k)); |
142 | 96.9k | }
(Per-instantiation counts: 4.24k and 92.6k for the two including translation units.)
143 | | |
144 | | |
145 | | #if BUILDING_CRC_CLMUL == 32 |
146 | | crc_attr_target |
147 | | static uint32_t |
148 | | crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) |
149 | | #else |
150 | | crc_attr_target |
151 | | static uint64_t |
152 | | crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc) |
153 | | #endif |
154 | 1.02M | { |
155 | | // We will assume that there is at least one byte of input. |
156 | 1.02M | if (size == 0) |
157 | 110 | return crc; |
158 | | |
159 | | // See crc_clmul_consts_gen.c. |
160 | | #if BUILDING_CRC_CLMUL == 32 |
161 | 1.01M | const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95); |
162 | 1.01M | const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191); |
163 | 1.01M | const __m128i mu_p = _mm_set_epi64x( |
164 | 1.01M | (int64_t)0xb4e5b025f7011641, 0x1db710640); |
165 | | #else |
166 | 8.37k | const __m128i fold512 = _mm_set_epi64x( |
167 | 8.37k | (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3); |
168 | | |
169 | | const __m128i fold128 = _mm_set_epi64x( |
170 | | (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4); |
171 | | |
172 | 8.37k | const __m128i mu_p = _mm_set_epi64x( |
173 | 8.37k | (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84); |
174 | 8.37k | #endif |
175 | | |
176 | 8.37k | __m128i v0, v1, v2, v3; |
177 | | |
178 | 8.37k | crc = ~crc; |
179 | | |
180 | 1.02M | if (size < 8) { |
181 | 342k | uint64_t x = crc; |
182 | 342k | size_t i = 0; |
183 | | |
184 | | // Checking the bit instead of comparing the size means |
185 | | // that we don't need to update the size between the steps. |
186 | 342k | if (size & 4) { |
187 | 218k | x ^= read32le(buf); |
188 | 218k | buf += 4; |
189 | 218k | i = 32; |
190 | 218k | } |
191 | | |
192 | 342k | if (size & 2) { |
193 | 236k | x ^= (uint64_t)read16le(buf) << i; |
194 | 236k | buf += 2; |
195 | 236k | i += 16; |
196 | 236k | } |
197 | | |
198 | 342k | if (size & 1) |
199 | 1.93k | x ^= (uint64_t)*buf << i; |
200 | | |
201 | 342k | v0 = my_set_low64((int64_t)x); |
202 | 342k | v0 = shift_left(v0, 8 - size); |
203 | | |
204 | 677k | } else if (size < 16) { |
205 | 669k | v0 = my_set_low64((int64_t)(crc ^ read64le(buf))); |
206 | | |
207 | | // NOTE: buf is intentionally left 8 bytes behind so that |
208 | | // we can read the last 1-7 bytes with read64le(buf + size). |
209 | 669k | size -= 8; |
210 | | |
211 | | // Handling 8-byte input specially is a speed optimization |
212 | | // as the clmul can be skipped. A branch is also needed to |
213 | | // avoid a too high shift amount. |
214 | 669k | if (size > 0) { |
215 | 8.04k | const size_t padding = 8 - size; |
216 | 8.04k | uint64_t high = read64le(buf + size) >> (padding * 8); |
217 | | |
218 | | #if defined(__i386__) || defined(_M_IX86) |
219 | | // Simple but likely not the best code for 32-bit x86. |
220 | | v0 = _mm_insert_epi32(v0, (int32_t)high, 2); |
221 | | v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3); |
222 | | #else |
223 | 8.04k | v0 = _mm_insert_epi64(v0, (int64_t)high, 1); |
224 | 8.04k | #endif |
225 | | |
226 | 8.04k | v0 = shift_left(v0, padding); |
227 | | |
228 | 8.04k | v1 = _mm_srli_si128(v0, 8); |
229 | 8.04k | v0 = _mm_clmulepi64_si128(v0, fold128, 0x10); |
230 | 8.04k | v0 = _mm_xor_si128(v0, v1); |
231 | 8.04k | } |
232 | 669k | } else { |
233 | 8.00k | v0 = my_set_low64((int64_t)crc); |
234 | | |
235 | | // To align or not to align the buf pointer? If the end of |
236 | | // the buffer isn't aligned, aligning the pointer here would |
237 | | // make us do an extra folding step with the associated byte |
238 | | // shuffling overhead. The cost of that would need to be |
239 | | // lower than the benefit of aligned reads. Testing on an old |
240 | | // Intel Ivy Bridge processor suggested that aligning isn't |
241 | | // worth the cost but it likely depends on the processor and |
242 | | // buffer size. Unaligned loads (MOVDQU) should be fast on |
243 | | // x86 processors that support PCLMULQDQ, so we don't align |
244 | | // the buf pointer here. |
245 | | |
246 | | // Read the first (and possibly the only) full 16 bytes. |
247 | 8.00k | v0 = _mm_xor_si128(v0, my_load128(buf)); |
248 | 8.00k | buf += 16; |
249 | 8.00k | size -= 16; |
250 | | |
251 | 8.00k | if (size >= 48) { |
252 | 2.73k | v1 = my_load128(buf); |
253 | 2.73k | v2 = my_load128(buf + 16); |
254 | 2.73k | v3 = my_load128(buf + 32); |
255 | 2.73k | buf += 48; |
256 | 2.73k | size -= 48; |
257 | | |
258 | 25.3k | while (size >= 64) { |
259 | 22.5k | v0 = fold_xor(v0, fold512, buf); |
260 | 22.5k | v1 = fold_xor(v1, fold512, buf + 16); |
261 | 22.5k | v2 = fold_xor(v2, fold512, buf + 32); |
262 | 22.5k | v3 = fold_xor(v3, fold512, buf + 48); |
263 | 22.5k | buf += 64; |
264 | 22.5k | size -= 64; |
265 | 22.5k | } |
266 | | |
267 | 2.73k | v0 = _mm_xor_si128(v1, fold(v0, fold128)); |
268 | 2.73k | v0 = _mm_xor_si128(v2, fold(v0, fold128)); |
269 | 2.73k | v0 = _mm_xor_si128(v3, fold(v0, fold128)); |
270 | 2.73k | } |
271 | | |
272 | 14.6k | while (size >= 16) { |
273 | 6.60k | v0 = fold_xor(v0, fold128, buf); |
274 | 6.60k | buf += 16; |
275 | 6.60k | size -= 16; |
276 | 6.60k | } |
277 | | |
278 | 8.00k | if (size > 0) { |
279 | | // We want the last "size" number of input bytes to |
280 | | // be at the high bits of v1. First do a full 16-byte |
281 | | // load and then mask the low bytes to zeros. |
282 | 4.64k | v1 = my_load128(buf + size - 16); |
283 | 4.64k | v1 = keep_high_bytes(v1, size); |
284 | | |
285 | | // Shift high bytes from v0 to the low bytes of v1. |
286 | | // |
287 | | // Alternatively we could replace the combination |
288 | | // keep_high_bytes + shift_right + _mm_or_si128 with |
289 | | // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would |
290 | | // require larger tables for the masks. Now there are |
291 | | // three loads (instead of two) from the mask tables |
292 | | // but they all are from the same cache line. |
293 | 4.64k | v1 = _mm_or_si128(v1, shift_right(v0, size)); |
294 | | |
295 | | // Shift high bytes of v0 away, padding the |
296 | | // low bytes with zeros. |
297 | 4.64k | v0 = shift_left(v0, 16 - size); |
298 | | |
299 | 4.64k | v0 = _mm_xor_si128(v1, fold(v0, fold128)); |
300 | 4.64k | } |
301 | | |
302 | 8.00k | v1 = _mm_srli_si128(v0, 8); |
303 | 8.00k | v0 = _mm_clmulepi64_si128(v0, fold128, 0x10); |
304 | 8.00k | v0 = _mm_xor_si128(v0, v1); |
305 | 8.00k | } |
306 | | |
307 | | // Barrett reduction |
308 | | |
309 | | #if BUILDING_CRC_CLMUL == 32 |
310 | | v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu |
311 | | v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p |
312 | | v0 = _mm_xor_si128(v0, v1); |
313 | | return ~(uint32_t)_mm_extract_epi32(v0, 2); |
314 | | #else |
315 | | // Because p is 65 bits but one bit doesn't fit into the 64-bit |
316 | | // half of __m128i, finish the second clmul by shifting v1 left |
317 | | // by 64 bits and xorring it to the final result. |
318 | | v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu |
319 | | v2 = _mm_slli_si128(v1, 8); |
320 | | v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p |
321 | | v0 = _mm_xor_si128(v0, v2); |
322 | | v0 = _mm_xor_si128(v0, v1); |
323 | | #if defined(__i386__) || defined(_M_IX86) |
324 | | return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) | |
325 | | (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2)); |
326 | | #else |
327 | 8.37k | return ~(uint64_t)_mm_extract_epi64(v0, 1); |
328 | | #endif |
329 | | #endif |
330 | 1.02M | }
(Per-instantiation counts: crc32_fast.c:crc32_arch_optimized 1.01M, crc64_fast.c:crc64_arch_optimized 8.37k.)
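The tail of the function above is the Barrett reduction from the Intel paper. As a hedged recap of the underlying identity in GF(2)[x]: for an n-bit CRC polynomial $P(x)$ and the precomputed constant $\mu(x) = \lfloor x^{2n} / P(x) \rfloor$, a value $T(x)$ of degree below $2n$ reduces as

$$T(x) \bmod P(x) = T(x) \oplus P(x)\,\Bigl\lfloor \mu(x)\,\bigl\lfloor T(x)/x^{n} \bigr\rfloor \big/ x^{n} \Bigr\rfloor,$$

which maps onto the two CLMULs against mu_p followed by XORs; the remaining n bits are then read out of v0 with the extract intrinsics. The actual constants (adjusted for bit reflection, and for CRC64 with the 65-bit p handled by the extra 64-bit shift noted in the comment) come from crc_clmul_consts_gen.c.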
331 | | |
332 | | |
333 | | // Even though this is an inline function, compile it only when needed. |
334 | | // This way it won't appear in E2K builds at all. |
335 | | #if defined(CRC32_GENERIC) || defined(CRC64_GENERIC) |
336 | | // Inlining this function duplicates the function body in crc32_resolve() and |
337 | | // crc64_resolve(), but this is acceptable because this is a tiny function. |
338 | | static inline bool |
339 | | is_arch_extension_supported(void) |
340 | 12 | { |
341 | 12 | int success = 1; |
342 | 12 | uint32_t r[4]; // eax, ebx, ecx, edx |
343 | | |
344 | | #if defined(_MSC_VER) |
345 | | // This needs <intrin.h> with MSVC. ICC has it as a built-in |
346 | | // on all platforms. |
347 | | __cpuid(r, 1); |
348 | | #elif defined(HAVE_CPUID_H) |
349 | | // Compared to just using __asm__ to run CPUID, this also checks |
350 | | // that CPUID is supported and saves and restores ebx as that is |
351 | | // needed with GCC < 5 with position-independent code (PIC). |
352 | 12 | success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); |
353 | | #else |
354 | | // Just a fallback that shouldn't be needed. |
355 | | __asm__("cpuid\n\t" |
356 | | : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) |
357 | | : "a"(1), "c"(0)); |
358 | | #endif |
359 | | |
360 | | // Returns true if these are supported: |
361 | | // CLMUL (bit 1 in ecx) |
362 | | // SSSE3 (bit 9 in ecx) |
363 | | // SSE4.1 (bit 19 in ecx) |
364 | 12 | const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); |
365 | 12 | return success && (r[2] & ecx_mask) == ecx_mask; |
366 | | |
367 | | // Alternative methods that weren't used: |
368 | | // - ICC's _may_i_use_cpu_feature: the other methods should work too. |
369 | | // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") |
370 | | // |
371 | | // CPUID decoding is needed with MSVC and older GCC anyway. This keeps |
372 | | // the feature checks in the build system simpler too. The nice thing |
373 | | // about __builtin_cpu_supports would be that it generates very short |
374 | | // code, as it only reads a variable set at startup, but a few bytes |
375 | | // don't matter here. |
376 | 12 | }
(Per-instantiation counts: crc32_fast.c:is_arch_extension_supported 6, crc64_fast.c:is_arch_extension_supported 6.)
377 | | #endif |
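For context on how this check gets used, here is a minimal resolver sketch in the spirit of the crc32_resolve() mentioned in the comment above; the function-pointer typedef and the crc32_generic fallback name are placeholders, not the actual liblzma declarations:

    // Hypothetical sketch only; the real resolver wiring lives in crc32_fast.c.
    typedef uint32_t (*crc32_func)(const uint8_t *buf, size_t size, uint32_t crc);

    static crc32_func
    crc32_resolve(void)
    {
            return is_arch_extension_supported()
                            ? &crc32_arch_optimized : &crc32_generic;
    }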