/src/gdal/frmts/zarr/crc32c.cpp
Line | Count | Source |
1 | | /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction |
2 | | * Copyright (C) 2013, 2015, 2021 Mark Adler |
3 | | * Version 1.4 31 May 2021 Mark Adler |
4 | | */ |
5 | | |
6 | | // Even Rouault, 2026: Slightly modified to avoid pthread and replacing by |
7 | | // C++11 static thread-safe initialization |
8 | | |
9 | | /* |
10 | | This software is provided 'as-is', without any express or implied |
11 | | warranty. In no event will the author be held liable for any damages |
12 | | arising from the use of this software. |
13 | | |
14 | | Permission is granted to anyone to use this software for any purpose, |
15 | | including commercial applications, and to alter it and redistribute it |
16 | | freely, subject to the following restrictions: |
17 | | |
18 | | 1. The origin of this software must not be misrepresented; you must not |
19 | | claim that you wrote the original software. If you use this software |
20 | | in a product, an acknowledgment in the product documentation would be |
21 | | appreciated but is not required. |
22 | | 2. Altered source versions must be plainly marked as such, and must not be |
23 | | misrepresented as being the original software. |
24 | | 3. This notice may not be removed or altered from any source distribution. |
25 | | |
26 | | Mark Adler |
27 | | madler@alumni.caltech.edu |
28 | | */ |
29 | | |
30 | | /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a |
31 | | CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software |
32 | | version is provided as a fall-back, as well as for speed comparisons. */ |
33 | | |
34 | | /* Version history: |
35 | | 1.0 10 Feb 2013 First version |
36 | | 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel |
37 | | 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning |
38 | | Load entire input into memory (test code) |
39 | | Argument gives number of times to repeat (test code) |
40 | | Argument < 0 forces software implementation (test code) |
41 | | 1.3 31 Dec 2015 Check for Intel architecture using compiler macro |
42 | | Support big-endian processors in software calculation |
43 | | Add header for external use |
44 | | 1.4 31 May 2021 Correct register constraints on assembly instructions |
45 | | */ |
46 | | |
47 | | #include "cpl_port.h" |
48 | | |
49 | | #include "crc32c.h" |
50 | | |
51 | | crc_func crc32c; |
52 | | |
53 | | /* CRC-32C (iSCSI) polynomial in reversed bit order. */ |
54 | 0 | #define POLY 0x82f63b78 |
55 | | |
56 | | uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len); |
57 | | uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len); |
58 | | #ifdef __x86_64__ |
59 | | |
60 | | /* Hardware CRC-32C for Intel and compatible processors. */ |
61 | | |
62 | | /* Multiply a matrix times a vector over the Galois field of two elements, |
63 | | GF(2). Each element is a bit in an unsigned integer. mat must have at |
64 | | least as many entries as the power of two for most significant one bit in |
65 | | vec. */ |
66 | 0 | static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) { |
67 | 0 | uint32_t sum = 0; |
68 | 0 | while (vec) { |
69 | 0 | if (vec & 1) |
70 | 0 | sum ^= *mat; |
71 | 0 | vec >>= 1; |
72 | 0 | mat++; |
73 | 0 | } |
74 | 0 | return sum; |
75 | 0 | } |
76 | | |
77 | | /* Multiply a matrix by itself over GF(2). Both mat and square must have 32 |
78 | | rows. */ |
79 | 0 | static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) { |
80 | 0 | for (unsigned n = 0; n < 32; n++) |
81 | 0 | square[n] = gf2_matrix_times(mat, mat[n]); |
82 | 0 | } |
83 | | |
84 | | /* Construct an operator to apply len zeros to a crc. len must be a power of |
85 | | two. If len is not a power of two, then the result is the same as for the |
86 | | largest power of two less than len. The result for len == 0 is the same as |
87 | | for len == 1. A version of this routine could be easily written for any |
88 | | len, but that is not needed for this application. */ |
89 | 0 | static void crc32c_zeros_op(uint32_t *even, size_t len) { |
90 | 0 | uint32_t odd[32]; /* odd-power-of-two zeros operator */ |
91 | | |
92 | | /* put operator for one zero bit in odd */ |
93 | 0 | odd[0] = POLY; /* CRC-32C polynomial */ |
94 | 0 | uint32_t row = 1; |
95 | 0 | for (unsigned n = 1; n < 32; n++) { |
96 | 0 | odd[n] = row; |
97 | 0 | row <<= 1; |
98 | 0 | } |
99 | | |
100 | | /* put operator for two zero bits in even */ |
101 | 0 | gf2_matrix_square(even, odd); |
102 | | |
103 | | /* put operator for four zero bits in odd */ |
104 | 0 | gf2_matrix_square(odd, even); |
105 | | |
106 | | /* first square will put the operator for one zero byte (eight zero bits), |
107 | | in even -- next square puts operator for two zero bytes in odd, and so |
108 | | on, until len has been rotated down to zero */ |
109 | 0 | do { |
110 | 0 | gf2_matrix_square(even, odd); |
111 | 0 | len >>= 1; |
112 | 0 | if (len == 0) |
113 | 0 | return; |
114 | 0 | gf2_matrix_square(odd, even); |
115 | 0 | len >>= 1; |
116 | 0 | } while (len); |
117 | | |
118 | | /* answer ended up in odd -- copy to even */ |
119 | 0 | for (unsigned n = 0; n < 32; n++) |
120 | 0 | even[n] = odd[n]; |
121 | 0 | } |
122 | | |
123 | | /* Take a length and build four lookup tables for applying the zeros operator |
124 | | for that length, byte-by-byte on the operand. */ |
125 | 0 | static void crc32c_zeros(uint32_t zeros[][256], size_t len) { |
126 | 0 | uint32_t op[32]; |
127 | |
|
128 | 0 | crc32c_zeros_op(op, len); |
129 | 0 | for (unsigned n = 0; n < 256; n++) { |
130 | 0 | zeros[0][n] = gf2_matrix_times(op, n); |
131 | 0 | zeros[1][n] = gf2_matrix_times(op, n << 8); |
132 | 0 | zeros[2][n] = gf2_matrix_times(op, n << 16); |
133 | 0 | zeros[3][n] = gf2_matrix_times(op, n << 24); |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | | /* Apply the zeros operator table to crc. */ |
138 | 0 | static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) { |
139 | 0 | return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ |
140 | 0 | zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24]; |
141 | 0 | } |
142 | | |
143 | | /* Block sizes for three-way parallel crc computation. LONG and SHORT must |
144 | | both be powers of two. The associated string constants must be set |
145 | | accordingly, for use in constructing the assembler instructions. */ |
146 | 0 | #define LONG 8192 |
147 | | #define LONGx1 "8192" |
148 | | #define LONGx2 "16384" |
149 | 0 | #define SHORT 256 |
150 | | #define SHORTx1 "256" |
151 | | #define SHORTx2 "512" |
152 | | |
153 | | /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */ |
154 | | static uint32_t crc32c_long[4][256]; |
155 | | static uint32_t crc32c_short[4][256]; |
156 | | |
157 | | /* Initialize tables for shifting crcs. */ |
158 | 0 | static void crc32c_init_hw(void) { |
159 | 0 | crc32c_zeros(crc32c_long, LONG); |
160 | 0 | crc32c_zeros(crc32c_short, SHORT); |
161 | 0 | } |
162 | | |
163 | | /* Compute CRC-32C using the Intel hardware instruction. */ |
164 | 0 | static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) { |
165 | | /* populate shift tables the first time through */ |
166 | 0 | static bool bInit = [](bool ret) |
167 | 0 | { |
168 | 0 | crc32c_init_hw(); |
169 | 0 | return ret; |
170 | 0 | }(true); |
171 | 0 | (void)bInit; |
172 | | |
173 | | /* pre-process the crc */ |
174 | 0 | crc = ~crc; |
175 | 0 | uint64_t crc0 = crc; /* 64-bits for crc32q instruction */ |
176 | | |
177 | | /* compute the crc for up to seven leading bytes to bring the data pointer |
178 | | to an eight-byte boundary */ |
179 | 0 | unsigned char const *next = static_cast<unsigned char const*>(buf); |
180 | 0 | while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) { |
181 | 0 | __asm__("crc32b\t" "(%1), %0" |
182 | 0 | : "+r"(crc0) |
183 | 0 | : "r"(next), "m"(*next)); |
184 | 0 | next++; |
185 | 0 | len--; |
186 | 0 | } |
187 | | |
188 | | /* compute the crc on sets of LONG*3 bytes, executing three independent crc |
189 | | instructions, each on LONG bytes -- this is optimized for the Nehalem, |
190 | | Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a |
191 | | throughput of one crc per cycle, but a latency of three cycles */ |
192 | 0 | while (len >= LONG*3) { |
193 | 0 | uint64_t crc1 = 0; |
194 | 0 | uint64_t crc2 = 0; |
195 | 0 | unsigned char const * const end = next + LONG; |
196 | 0 | do { |
197 | 0 | __asm__("crc32q\t" "(%3), %0\n\t" |
198 | 0 | "crc32q\t" LONGx1 "(%3), %1\n\t" |
199 | 0 | "crc32q\t" LONGx2 "(%3), %2" |
200 | 0 | : "+r"(crc0), "+r"(crc1), "+r"(crc2) |
201 | 0 | : "r"(next), "m"(*next)); |
202 | 0 | next += 8; |
203 | 0 | } while (next < end); |
204 | 0 | crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc1; |
205 | 0 | crc0 = crc32c_shift(crc32c_long, static_cast<uint32_t>(crc0)) ^ crc2; |
206 | 0 | next += LONG*2; |
207 | 0 | len -= LONG*3; |
208 | 0 | } |
209 | | |
210 | | /* do the same thing, but now on SHORT*3 blocks for the remaining data less |
211 | | than a LONG*3 block */ |
212 | 0 | while (len >= SHORT*3) { |
213 | 0 | uint64_t crc1 = 0; |
214 | 0 | uint64_t crc2 = 0; |
215 | 0 | unsigned char const * const end = next + SHORT; |
216 | 0 | do { |
217 | 0 | __asm__("crc32q\t" "(%3), %0\n\t" |
218 | 0 | "crc32q\t" SHORTx1 "(%3), %1\n\t" |
219 | 0 | "crc32q\t" SHORTx2 "(%3), %2" |
220 | 0 | : "+r"(crc0), "+r"(crc1), "+r"(crc2) |
221 | 0 | : "r"(next), "m"(*next)); |
222 | 0 | next += 8; |
223 | 0 | } while (next < end); |
224 | 0 | crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc1; |
225 | 0 | crc0 = crc32c_shift(crc32c_short, static_cast<uint32_t>(crc0)) ^ crc2; |
226 | 0 | next += SHORT*2; |
227 | 0 | len -= SHORT*3; |
228 | 0 | } |
229 | | |
230 | | /* compute the crc on the remaining eight-byte units less than a SHORT*3 |
231 | | block */ |
232 | 0 | { |
233 | 0 | unsigned char const * const end = next + (len - (len & 7)); |
234 | 0 | while (next < end) { |
235 | 0 | __asm__("crc32q\t" "(%1), %0" |
236 | 0 | : "+r"(crc0) |
237 | 0 | : "r"(next), "m"(*next)); |
238 | 0 | next += 8; |
239 | 0 | } |
240 | 0 | len &= 7; |
241 | 0 | } |
242 | | |
243 | | /* compute the crc for up to seven trailing bytes */ |
244 | 0 | while (len) { |
245 | 0 | __asm__("crc32b\t" "(%1), %0" |
246 | 0 | : "+r"(crc0) |
247 | 0 | : "r"(next), "m"(*next)); |
248 | 0 | next++; |
249 | 0 | len--; |
250 | 0 | } |
251 | | |
252 | | /* return a post-processed crc */ |
253 | 0 | return ~static_cast<uint32_t>(crc0); |
254 | 0 | } |
255 | | |
256 | | /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors |
257 | | introduced in November, 2008. This does not check for the existence of the |
258 | | cpuid instruction itself, which was introduced on the 486SL in 1992, so this |
259 | | will fail on earlier x86 processors. cpuid works on all Pentium and later |
260 | | processors. */ |
261 | | #define SSE42(have) \ |
262 | 0 | do { \ |
263 | 0 | uint32_t eax, ecx; \ |
264 | 0 | eax = 1; \ |
265 | 0 | __asm__("cpuid" \ |
266 | 0 | : "=c"(ecx) \ |
267 | 0 | : "a"(eax) \ |
268 | 0 | : "%ebx", "%edx"); \ |
269 | 0 | (have) = (ecx >> 20) & 1; \ |
270 | 0 | } while (0) |
271 | | |
272 | | /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware |
273 | | version. Otherwise, use the software version. */ |
274 | 0 | void crc32c_init(void) { |
275 | |
|
276 | 0 | static const bool bInit = [](bool ret){ |
277 | 0 | int sse42; |
278 | |
|
279 | 0 | SSE42(sse42); |
280 | 0 | if (sse42) { |
281 | 0 | crc32c = crc32c_hw; |
282 | 0 | } else { |
283 | 0 | crc32c = crc32c_sw; |
284 | 0 | } |
285 | 0 | return ret; |
286 | 0 | }(true); |
287 | 0 | (void)bInit; |
288 | 0 | } |
289 | | |
290 | | #elif defined(__aarch64__) && (defined(__linux__) || defined(__APPLE__)) |
291 | | #if defined(__linux__) && defined(HAVE_SYS_AUX_H) |
292 | | #include <sys/auxv.h> |
293 | | #elif defined(__APPLE__) |
294 | | #include <sys/sysctl.h> |
295 | | #endif |
296 | | |
297 | | #if defined(HWCAP_CRC32) |
298 | | static inline uint32_t crc32cx(uint32_t crc, const uint64_t data) |
299 | | { |
300 | | asm(".arch_extension crc\n" |
301 | | "crc32cx %w0, %w0, %x1" : "+r" (crc) : "r" (data)); |
302 | | return crc; |
303 | | } |
304 | | |
305 | | static inline uint32_t crc32cb(uint32_t crc, const uint8_t data) |
306 | | { |
307 | | asm(".arch_extension crc\n" |
308 | | "crc32cb %w0, %w0, %w1" : "+r" (crc) : "r" (data)); |
309 | | return crc; |
310 | | } |
311 | | |
312 | | static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) { |
313 | | crc = ~crc; |
314 | | unsigned char const *next = buf; |
315 | | |
316 | | while (((uintptr_t)next & 7) && len > 0) { |
317 | | crc = crc32cb(crc, *(uint8_t *)next); |
318 | | next++; |
319 | | len--; |
320 | | } |
321 | | |
322 | | while (len >= 64) { |
323 | | uint64_t *next8 = (uint64_t *)next; |
324 | | crc = crc32cx(crc, next8[0]); |
325 | | crc = crc32cx(crc, next8[1]); |
326 | | crc = crc32cx(crc, next8[2]); |
327 | | crc = crc32cx(crc, next8[3]); |
328 | | crc = crc32cx(crc, next8[4]); |
329 | | crc = crc32cx(crc, next8[5]); |
330 | | crc = crc32cx(crc, next8[6]); |
331 | | crc = crc32cx(crc, next8[7]); |
332 | | next += 64; |
333 | | len -= 64; |
334 | | } |
335 | | |
336 | | while (len >= 8) { |
337 | | crc = crc32cx(crc, *(uint64_t *)next); |
338 | | next += 8; |
339 | | len -= 8; |
340 | | } |
341 | | |
342 | | while (len > 0) { |
343 | | crc = crc32cb(crc, *(uint8_t *)next); |
344 | | next++; |
345 | | len--; |
346 | | } |
347 | | |
348 | | return ~crc; |
349 | | } |
350 | | |
351 | | void crc32c_init(void) { |
352 | | #if defined(__linux__) |
353 | | uint64_t auxv = getauxval(AT_HWCAP); |
354 | | |
355 | | crc32c = crc32c_sw; |
356 | | if (auxv & HWCAP_CRC32) |
357 | | crc32c = crc32c_hw; |
358 | | #elif defined(__APPLE__) |
359 | | int armv8_crc32; |
360 | | size_t size = sizeof(armv8_crc32); |
361 | | |
362 | | if (sysctlbyname("hw.optional.armv8_crc32", &armv8_crc32, &size, NULL, 0) == 0 && |
363 | | armv8_crc32 == 1) |
364 | | crc32c = crc32c_hw; |
365 | | #endif |
366 | | } |
367 | | #else /* no hw crc32 on arm64 system supported? old compiler/libc/kernel? */ |
368 | | void crc32c_init(void) { |
369 | | crc32c = crc32c_sw; |
370 | | } |
371 | | #endif |
372 | | #else /* !__x86_64__i && !__aarch64__ */ |
373 | | void crc32c_init(void) { |
374 | | crc32c = crc32c_sw; |
375 | | } |
376 | | |
377 | | #endif |
378 | | |
379 | | #if CPL_IS_LSB |
380 | | |
381 | | /* Construct table for software CRC-32C little-endian calculation. */ |
382 | | |
383 | | static uint32_t crc32c_table_little[8][256]; |
384 | 0 | static void crc32c_init_sw_little(void) { |
385 | 0 | for (unsigned n = 0; n < 256; n++) { |
386 | 0 | uint32_t crc = n; |
387 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
388 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
389 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
390 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
391 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
392 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
393 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
394 | 0 | crc = (crc & 1) ? (crc >> 1) ^ POLY : crc >> 1; |
395 | 0 | crc32c_table_little[0][n] = crc; |
396 | 0 | } |
397 | 0 | for (unsigned n = 0; n < 256; n++) { |
398 | 0 | uint32_t crc = crc32c_table_little[0][n]; |
399 | 0 | for (unsigned k = 1; k < 8; k++) { |
400 | 0 | crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8); |
401 | 0 | crc32c_table_little[k][n] = crc; |
402 | 0 | } |
403 | 0 | } |
404 | 0 | } |
405 | | |
406 | | /* Compute a CRC-32C in software assuming a little-endian architecture, |
407 | | constructing the required table if that hasn't already been done. */ |
408 | 0 | uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) { |
409 | 0 | unsigned char const *next = static_cast<unsigned char const*>(buf); |
410 | |
|
411 | 0 | static bool bInit = [](bool ret) |
412 | 0 | { |
413 | 0 | crc32c_init_sw_little(); |
414 | 0 | return ret; |
415 | 0 | }(true); |
416 | 0 | (void)bInit; |
417 | |
|
418 | 0 | crc = ~crc; |
419 | 0 | while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) { |
420 | 0 | crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
421 | 0 | len--; |
422 | 0 | } |
423 | 0 | if (len >= 8) { |
424 | 0 | uint64_t crcw = crc; |
425 | 0 | do { |
426 | 0 | crcw ^= *reinterpret_cast<uint64_t const *>(next); |
427 | 0 | crcw = crc32c_table_little[7][crcw & 0xff] ^ |
428 | 0 | crc32c_table_little[6][(crcw >> 8) & 0xff] ^ |
429 | 0 | crc32c_table_little[5][(crcw >> 16) & 0xff] ^ |
430 | 0 | crc32c_table_little[4][(crcw >> 24) & 0xff] ^ |
431 | 0 | crc32c_table_little[3][(crcw >> 32) & 0xff] ^ |
432 | 0 | crc32c_table_little[2][(crcw >> 40) & 0xff] ^ |
433 | 0 | crc32c_table_little[1][(crcw >> 48) & 0xff] ^ |
434 | 0 | crc32c_table_little[0][crcw >> 56]; |
435 | 0 | next += 8; |
436 | 0 | len -= 8; |
437 | 0 | } while (len >= 8); |
438 | 0 | crc = static_cast<uint32_t>(crcw); |
439 | 0 | } |
440 | 0 | while (len) { |
441 | 0 | crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); |
442 | 0 | len--; |
443 | 0 | } |
444 | 0 | return ~crc; |
445 | 0 | } |
446 | | |
447 | | #else |
448 | | |
449 | | /* Swap the bytes in a uint64_t. (Only for big-endian.) */ |
450 | | #if defined(__has_builtin) || (defined(__GNUC__) && \ |
451 | | (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) |
452 | | # define swap __builtin_bswap64 |
453 | | #else |
454 | | static inline uint64_t swap(uint64_t x) { |
455 | | x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff); |
456 | | x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff); |
457 | | return (x << 32) | (x >> 32); |
458 | | } |
459 | | #endif |
460 | | |
461 | | /* Construct tables for software CRC-32C big-endian calculation. */ |
462 | | |
463 | | static uint32_t crc32c_table_big_byte[256]; |
464 | | static uint64_t crc32c_table_big[8][256]; |
465 | | static void crc32c_init_sw_big(void) { |
466 | | for (unsigned n = 0; n < 256; n++) { |
467 | | uint32_t crc = n; |
468 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
469 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
470 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
471 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
472 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
473 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
474 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
475 | | crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; |
476 | | crc32c_table_big_byte[n] = crc; |
477 | | } |
478 | | for (unsigned n = 0; n < 256; n++) { |
479 | | uint32_t crc = crc32c_table_big_byte[n]; |
480 | | crc32c_table_big[0][n] = swap(crc); |
481 | | for (unsigned k = 1; k < 8; k++) { |
482 | | crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8); |
483 | | crc32c_table_big[k][n] = swap(crc); |
484 | | } |
485 | | } |
486 | | } |
487 | | |
488 | | /* Compute a CRC-32C in software assuming a big-endian architecture, |
489 | | constructing the required tables if that hasn't already been done. */ |
490 | | uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) { |
491 | | unsigned char const *next = reinterpret_cast<unsigned char const*>(buf); |
492 | | |
493 | | static bool bInit = []() |
494 | | { |
495 | | crc32c_init_sw_big(); |
496 | | return true; |
497 | | }(); |
498 | | (void)bInit; |
499 | | |
500 | | crc = ~crc; |
501 | | while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) { |
502 | | crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8); |
503 | | len--; |
504 | | } |
505 | | if (len >= 8) { |
506 | | uint64_t crcw = swap(crc); |
507 | | do { |
508 | | crcw ^= *reinterpret_cast<uint64_t const *>(next); |
509 | | crcw = crc32c_table_big[0][crcw & 0xff] ^ |
510 | | crc32c_table_big[1][(crcw >> 8) & 0xff] ^ |
511 | | crc32c_table_big[2][(crcw >> 16) & 0xff] ^ |
512 | | crc32c_table_big[3][(crcw >> 24) & 0xff] ^ |
513 | | crc32c_table_big[4][(crcw >> 32) & 0xff] ^ |
514 | | crc32c_table_big[5][(crcw >> 40) & 0xff] ^ |
515 | | crc32c_table_big[6][(crcw >> 48) & 0xff] ^ |
516 | | crc32c_table_big[7][(crcw >> 56)]; |
517 | | next += 8; |
518 | | len -= 8; |
519 | | } while (len >= 8); |
520 | | crc = swap(crcw); |
521 | | } |
522 | | while (len) { |
523 | | crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8); |
524 | | len--; |
525 | | } |
526 | | return ~crc; |
527 | | } |
528 | | |
529 | | #endif |
530 | | |
531 | | /* Table-driven software CRC-32C. This is about 15 times slower than using the |
532 | | hardware instructions. Determine the endianness of the processor and proceed |
533 | | accordingly. Ideally the endianness will be determined at compile time, in |
534 | | which case the unused functions and tables for the other endianness will be |
535 | | removed by the optimizer. If not, then the proper routines and tables will |
536 | | be used, even if the endianness is changed mid-stream. (Yes, there are |
537 | | processors that permit that -- go figure.) */ |
538 | 0 | uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) { |
539 | 0 | #if CPL_IS_LSB |
540 | 0 | return crc32c_sw_little(crc, buf, len); |
541 | | #else |
542 | | return crc32c_sw_big(crc, buf, len); |
543 | | #endif |
544 | 0 | } |