/src/CMake/Utilities/cmliblzma/liblzma/check/crc32_fast.c
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file crc32.c |
6 | | /// \brief CRC32 calculation |
7 | | // |
8 | | // Authors: Lasse Collin |
9 | | // Ilya Kurdyukov |
10 | | // Hans Jansen |
11 | | // |
12 | | /////////////////////////////////////////////////////////////////////////////// |
13 | | |
14 | | #include "check.h" |
15 | | #include "crc_common.h" |
16 | | |
17 | | #if defined(CRC_X86_CLMUL) |
18 | | # define BUILDING_CRC32_CLMUL |
19 | | # include "crc_x86_clmul.h" |
20 | | #elif defined(CRC32_ARM64) |
21 | | # include "crc32_arm64.h" |
22 | | #endif |
23 | | |
24 | | |
25 | | #ifdef CRC32_GENERIC |
26 | | |
27 | | /////////////////// |
28 | | // Generic CRC32 // |
29 | | /////////////////// |
30 | | |
31 | | static uint32_t |
32 | | crc32_generic(const uint8_t *buf, size_t size, uint32_t crc) |
33 | 1.74k | { |
34 | 1.74k | crc = ~crc; |
35 | | |
36 | | #ifdef WORDS_BIGENDIAN |
37 | | crc = byteswap32(crc); |
38 | | #endif |
39 | | |
40 | 1.74k | if (size > 8) { |
41 | | // Fix the alignment, if needed. The if statement above |
42 | | // ensures that this won't read past the end of buf[]. |
43 | 918 | while ((uintptr_t)(buf) & 7) { |
44 | 8 | crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc); |
45 | 8 | --size; |
46 | 8 | } |
47 | | |
48 | | // Calculate the position where to stop. |
49 | 910 | const uint8_t *const limit = buf + (size & ~(size_t)(7)); |
50 | | |
51 | | // Calculate how many bytes must be calculated separately |
52 | | // before returning the result. |
53 | 910 | size &= (size_t)(7); |
54 | | |
55 | | // Calculate the CRC32 using the slice-by-eight algorithm. |
56 | 2.80M | while (buf < limit) { |
57 | 2.80M | crc ^= aligned_read32ne(buf); |
58 | 2.80M | buf += 4; |
59 | | |
60 | 2.80M | crc = lzma_crc32_table[7][A(crc)] |
61 | 2.80M | ^ lzma_crc32_table[6][B(crc)] |
62 | 2.80M | ^ lzma_crc32_table[5][C(crc)] |
63 | 2.80M | ^ lzma_crc32_table[4][D(crc)]; |
64 | | |
65 | 2.80M | const uint32_t tmp = aligned_read32ne(buf); |
66 | 2.80M | buf += 4; |
67 | | |
68 | | // At least with some compilers, it is critical for |
69 | | // performance, that the crc variable is XORed |
70 | | // between the two table-lookup pairs. |
71 | 2.80M | crc = lzma_crc32_table[3][A(tmp)] |
72 | 2.80M | ^ lzma_crc32_table[2][B(tmp)] |
73 | 2.80M | ^ crc |
74 | 2.80M | ^ lzma_crc32_table[1][C(tmp)] |
75 | 2.80M | ^ lzma_crc32_table[0][D(tmp)]; |
76 | 2.80M | } |
77 | 910 | } |
78 | | |
79 | 4.32k | while (size-- != 0) |
80 | 2.57k | crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc); |
81 | | |
82 | | #ifdef WORDS_BIGENDIAN |
83 | | crc = byteswap32(crc); |
84 | | #endif |
85 | | |
86 | 1.74k | return ~crc; |
87 | 1.74k | } |
88 | | #endif |
89 | | |
90 | | |
91 | | #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) |
92 | | |
93 | | ////////////////////////// |
94 | | // Function dispatching // |
95 | | ////////////////////////// |
96 | | |
97 | | // If both the generic and arch-optimized implementations are built, then |
98 | | // the function to use is selected at runtime because the system running |
99 | | // the binary might not have the arch-specific instruction set extension(s) |
100 | | // available. The dispatch methods in order of priority: |
101 | | // |
102 | | // 1. Constructor. This method uses __attribute__((__constructor__)) to |
103 | | // set crc32_func at load time. This avoids extra computation (and any |
104 | | // unlikely threading bugs) on the first call to lzma_crc32() to decide |
105 | | // which implementation should be used. |
106 | | // |
107 | | // 2. First Call Resolution. On the very first call to lzma_crc32(), the |
108 | | // call will be directed to crc32_dispatch() instead. This will set the |
109 | | // appropriate implementation function and will not be called again. |
110 | | // This method does not use any kind of locking but is safe because if |
111 | | // multiple threads run the dispatcher simultaneously then they will all |
112 | | // set crc32_func to the same value. |
113 | | |
114 | | typedef uint32_t (*crc32_func_type)( |
115 | | const uint8_t *buf, size_t size, uint32_t crc); |
116 | | |
117 | | // This resolver is shared between all dispatch methods. |
118 | | static crc32_func_type |
119 | | crc32_resolve(void) |
120 | | { |
121 | | return is_arch_extension_supported() |
122 | | ? &crc32_arch_optimized : &crc32_generic; |
123 | | } |
124 | | |
125 | | |
126 | | #ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR |
127 | | // Constructor method. |
128 | | # define CRC32_SET_FUNC_ATTR __attribute__((__constructor__)) |
129 | | static crc32_func_type crc32_func; |
130 | | #else |
131 | | // First Call Resolution method. |
132 | | # define CRC32_SET_FUNC_ATTR |
133 | | static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc); |
134 | | static crc32_func_type crc32_func = &crc32_dispatch; |
135 | | #endif |
136 | | |
137 | | CRC32_SET_FUNC_ATTR |
138 | | static void |
139 | | crc32_set_func(void) |
140 | | { |
141 | | crc32_func = crc32_resolve(); |
142 | | return; |
143 | | } |
144 | | |
145 | | #ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR |
146 | | static uint32_t |
147 | | crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc) |
148 | | { |
149 | | // When __attribute__((__constructor__)) isn't supported, set the |
150 | | // function pointer without any locking. If multiple threads run |
151 | | // the detection code in parallel, they will all end up setting |
152 | | // the pointer to the same value. This avoids the use of |
153 | | // mythread_once() on every call to lzma_crc32() but this likely |
154 | | // isn't strictly standards compliant. Let's change it if it breaks. |
155 | | crc32_set_func(); |
156 | | return crc32_func(buf, size, crc); |
157 | | } |
158 | | |
159 | | #endif |
160 | | #endif |
161 | | |
162 | | |
163 | | extern LZMA_API(uint32_t) |
164 | | lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) |
165 | 1.74k | { |
166 | | #if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) |
167 | | // On x86-64, if CLMUL is available, it is the best for non-tiny |
168 | | // inputs, being over twice as fast as the generic slice-by-four |
169 | | // version. However, for size <= 16 it's different. In the extreme |
170 | | // case of size == 1 the generic version can be five times faster. |
171 | | // At size >= 8 the CLMUL starts to become reasonable. It |
172 | | // varies depending on the alignment of buf too. |
173 | | // |
174 | | // The above doesn't include the overhead of mythread_once(). |
175 | | // At least on x86-64 GNU/Linux, pthread_once() is very fast but |
176 | | // it still makes lzma_crc32(buf, 1, crc) 50-100 % slower. When |
177 | | // size reaches 12-16 bytes the overhead becomes negligible. |
178 | | // |
179 | | // So using the generic version for size <= 16 may give better |
180 | | // performance with tiny inputs but if such inputs happen rarely |
181 | | // it's not so obvious because then the lookup table of the |
182 | | // generic version may not be in the processor cache. |
183 | | #ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS |
184 | | if (size <= 16) |
185 | | return crc32_generic(buf, size, crc); |
186 | | #endif |
187 | | |
188 | | /* |
189 | | #ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR |
190 | | // See crc32_dispatch(). This would be the alternative which uses |
191 | | // locking and doesn't use crc32_dispatch(). Note that on Windows |
192 | | // this method needs Vista threads. |
193 | | mythread_once(crc64_set_func); |
194 | | #endif |
195 | | */ |
196 | | return crc32_func(buf, size, crc); |
197 | | |
198 | | #elif defined(CRC32_ARCH_OPTIMIZED) |
199 | | return crc32_arch_optimized(buf, size, crc); |
200 | | |
201 | | #else |
202 | 1.74k | return crc32_generic(buf, size, crc); |
203 | 1.74k | #endif |
204 | 1.74k | } |