/src/php-src/ext/standard/crc32_x86.c
Line | Count | Source |
1 | | /* |
2 | | +----------------------------------------------------------------------+ |
3 | | | Copyright © The PHP Group and Contributors. | |
4 | | +----------------------------------------------------------------------+ |
5 | | | This source file is subject to the Modified BSD License that is | |
6 | | | bundled with this package in the file LICENSE, and is available | |
7 | | | through the World Wide Web at <https://www.php.net/license/>. | |
8 | | | | |
9 | | | SPDX-License-Identifier: BSD-3-Clause | |
10 | | +----------------------------------------------------------------------+ |
11 | | | Author: Frank Du <frank.du@intel.com> | |
12 | | +----------------------------------------------------------------------+ |
13 | | | Compute the crc32 of the buffer. Based on: | |
14 | | | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ" | |
15 | | | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 | |
16 | | */ |
17 | | |
18 | | #include "crc32_x86.h" |
19 | | |
20 | | #if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) |
21 | | # include <nmmintrin.h> |
22 | | # include <wmmintrin.h> |
23 | | #endif |
24 | | |
25 | | #ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER |
26 | | # include "Zend/zend_cpuinfo.h" |
27 | | #endif |
28 | | |
29 | | #if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) |
30 | | |
/*
 * Per-polynomial constants consumed by the PCLMUL folding routines.
 * Each member is a pair of 64-bit values that is loaded as one 128-bit
 * vector operand for the carry-less multiplications.
 */
typedef struct _crc32_pclmul_bit_consts {
	/* loaded for the "parallel folding by 4" loop */
	uint64_t k1k2[2];
	/* loaded for the "fold 4 to 1" step and the "folding by 1" loop */
	uint64_t k3k4[2];
	/* loaded for the reduction of the final fold down to 64 bits */
	uint64_t k5k6[2];
	/* loaded for the Barrett reduction step */
	uint64_t uPx[2];
} crc32_pclmul_consts;
37 | | |
38 | | static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = { |
39 | | { /* X86_CRC32, polynomial: 0x04C11DB7 */ |
40 | | {0x00e6228b11, 0x008833794c}, /* endianness swap */ |
41 | | {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */ |
42 | | {0x00490d678d, 0x00f200aa66}, /* endianness swap */ |
43 | | {0x0104d101df, 0x0104c11db7} |
44 | | }, |
45 | | { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */ |
46 | | {0x0154442bd4, 0x01c6e41596}, |
47 | | {0x01751997d0, 0x00ccaa009e}, |
48 | | {0x0163cd6124, 0x01db710640}, |
49 | | {0x01f7011641, 0x01db710641}, |
50 | | }, |
51 | | { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */ |
52 | | {0x00740eef02, 0x009e4addf8}, |
53 | | {0x00f20c0dfe, 0x014cd00bd6}, |
54 | | {0x00dd45aab8, 0x0000000000}, |
55 | | {0x00dea713f1, 0x0105ec76f0} |
56 | | } |
57 | | }; |
58 | | |
/*
 * Byte-shuffle control for _mm_shuffle_epi8: output byte i takes input byte
 * (15 - i), i.e. the 16 bytes of a vector are reversed. The table is only
 * ever read, so it is const.
 */
static const uint8_t pclmul_shuf_mask_table[16] = {
	0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
	0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};
63 | | |
/* Data is folded in 128-bit chunks: one folding block is 16 bytes. */
#define CRC32_FOLDING_BLOCK_SIZE 16
66 | | |
67 | | /* PCLMUL version of non-reflected crc32 */ |
68 | | ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); |
69 | | size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) |
70 | 0 | { |
71 | 0 | size_t nr_in = nr; |
72 | 0 | __m128i x0, x1, x2, k, shuf_mask; |
73 | |
|
74 | 0 | if (nr < CRC32_FOLDING_BLOCK_SIZE) { |
75 | 0 | return 0; |
76 | 0 | } |
77 | | |
78 | 0 | shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table)); |
79 | 0 | x0 = _mm_cvtsi32_si128(*crc); |
80 | 0 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
81 | 0 | x0 = _mm_slli_si128(x0, 12); |
82 | 0 | x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ |
83 | 0 | x0 = _mm_xor_si128(x1, x0); |
84 | 0 | p += CRC32_FOLDING_BLOCK_SIZE; |
85 | 0 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
86 | |
|
87 | 0 | if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { |
88 | 0 | __m128i x3, x4; |
89 | |
|
90 | 0 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
91 | 0 | x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ |
92 | 0 | x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
93 | 0 | x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ |
94 | 0 | x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
95 | 0 | x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */ |
96 | 0 | p += CRC32_FOLDING_BLOCK_SIZE * 3; |
97 | 0 | nr -= CRC32_FOLDING_BLOCK_SIZE * 3; |
98 | |
|
99 | 0 | k = _mm_loadu_si128((__m128i *)consts->k1k2); |
100 | | /* parallel folding by 4 */ |
101 | 0 | while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { |
102 | 0 | __m128i x5, x6, x7, x8, x9, x10, x11; |
103 | 0 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
104 | 0 | x5 = _mm_clmulepi64_si128(x1, k, 0x00); |
105 | 0 | x6 = _mm_clmulepi64_si128(x2, k, 0x00); |
106 | 0 | x7 = _mm_clmulepi64_si128(x3, k, 0x00); |
107 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
108 | 0 | x1 = _mm_clmulepi64_si128(x1, k, 0x11); |
109 | 0 | x2 = _mm_clmulepi64_si128(x2, k, 0x11); |
110 | 0 | x3 = _mm_clmulepi64_si128(x3, k, 0x11); |
111 | 0 | x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
112 | 0 | x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */ |
113 | 0 | x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
114 | 0 | x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */ |
115 | 0 | x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
116 | 0 | x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */ |
117 | 0 | x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); |
118 | 0 | x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */ |
119 | 0 | x0 = _mm_xor_si128(x0, x4); |
120 | 0 | x1 = _mm_xor_si128(x1, x5); |
121 | 0 | x2 = _mm_xor_si128(x2, x6); |
122 | 0 | x3 = _mm_xor_si128(x3, x7); |
123 | 0 | x0 = _mm_xor_si128(x0, x8); |
124 | 0 | x1 = _mm_xor_si128(x1, x9); |
125 | 0 | x2 = _mm_xor_si128(x2, x10); |
126 | 0 | x3 = _mm_xor_si128(x3, x11); |
127 | |
|
128 | 0 | p += CRC32_FOLDING_BLOCK_SIZE * 4; |
129 | 0 | nr -= CRC32_FOLDING_BLOCK_SIZE * 4; |
130 | 0 | } |
131 | |
|
132 | 0 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
133 | | /* fold 4 to 1, [x1, x2, x3] -> x0 */ |
134 | 0 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
135 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
136 | 0 | x0 = _mm_xor_si128(x0, x1); |
137 | 0 | x0 = _mm_xor_si128(x0, x4); |
138 | 0 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
139 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
140 | 0 | x0 = _mm_xor_si128(x0, x2); |
141 | 0 | x0 = _mm_xor_si128(x0, x4); |
142 | 0 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
143 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
144 | 0 | x0 = _mm_xor_si128(x0, x3); |
145 | 0 | x0 = _mm_xor_si128(x0, x4); |
146 | 0 | } |
147 | |
|
148 | 0 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
149 | | /* folding by 1 */ |
150 | 0 | while (nr >= CRC32_FOLDING_BLOCK_SIZE) { |
151 | | /* load next to x2, fold to x0, x1 */ |
152 | 0 | x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
153 | 0 | x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ |
154 | 0 | x1 = _mm_clmulepi64_si128(x0, k, 0x00); |
155 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
156 | 0 | x0 = _mm_xor_si128(x0, x2); |
157 | 0 | x0 = _mm_xor_si128(x0, x1); |
158 | 0 | p += CRC32_FOLDING_BLOCK_SIZE; |
159 | 0 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
160 | 0 | } |
161 | | |
162 | | /* reduce 128-bits(final fold) to 96-bits */ |
163 | 0 | k = _mm_loadu_si128((__m128i*)consts->k5k6); |
164 | 0 | x1 = _mm_clmulepi64_si128(x0, k, 0x11); |
165 | 0 | x0 = _mm_slli_si128(x0, 8); |
166 | 0 | x0 = _mm_srli_si128(x0, 4); |
167 | 0 | x0 = _mm_xor_si128(x0, x1); |
168 | | /* reduce 96-bits to 64-bits */ |
169 | 0 | x1 = _mm_clmulepi64_si128(x0, k, 0x01); |
170 | 0 | x0 = _mm_xor_si128(x0, x1); |
171 | | |
172 | | /* barrett reduction */ |
173 | 0 | k = _mm_loadu_si128((__m128i*)consts->uPx); |
174 | 0 | x1 = _mm_move_epi64(x0); |
175 | 0 | x1 = _mm_srli_si128(x1, 4); |
176 | 0 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
177 | 0 | x1 = _mm_srli_si128(x1, 4); |
178 | 0 | x1 = _mm_clmulepi64_si128(x1, k, 0x10); |
179 | 0 | x0 = _mm_xor_si128(x1, x0); |
180 | 0 | *crc = _mm_extract_epi32(x0, 0); |
181 | 0 | return (nr_in - nr); /* the nr processed */ |
182 | 0 | } |
183 | | |
184 | | /* PCLMUL version of reflected crc32 */ |
185 | | ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); |
186 | | size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) |
187 | 63 | { |
188 | 63 | size_t nr_in = nr; |
189 | 63 | __m128i x0, x1, x2, k; |
190 | | |
191 | 63 | if (nr < CRC32_FOLDING_BLOCK_SIZE) { |
192 | 33 | return 0; |
193 | 33 | } |
194 | | |
195 | 30 | x0 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
196 | 30 | x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc)); |
197 | 30 | p += CRC32_FOLDING_BLOCK_SIZE; |
198 | 30 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
199 | 30 | if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { |
200 | 3 | __m128i x3, x4; |
201 | | |
202 | 3 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
203 | 3 | x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
204 | 3 | x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
205 | 3 | p += CRC32_FOLDING_BLOCK_SIZE * 3; |
206 | 3 | nr -= CRC32_FOLDING_BLOCK_SIZE * 3; |
207 | | |
208 | 3 | k = _mm_loadu_si128((__m128i *)consts->k1k2); |
209 | | /* parallel folding by 4 */ |
210 | 3 | while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { |
211 | 0 | __m128i x5, x6, x7, x8, x9, x10, x11; |
212 | 0 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
213 | 0 | x5 = _mm_clmulepi64_si128(x1, k, 0x00); |
214 | 0 | x6 = _mm_clmulepi64_si128(x2, k, 0x00); |
215 | 0 | x7 = _mm_clmulepi64_si128(x3, k, 0x00); |
216 | 0 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
217 | 0 | x1 = _mm_clmulepi64_si128(x1, k, 0x11); |
218 | 0 | x2 = _mm_clmulepi64_si128(x2, k, 0x11); |
219 | 0 | x3 = _mm_clmulepi64_si128(x3, k, 0x11); |
220 | 0 | x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
221 | 0 | x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
222 | 0 | x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
223 | 0 | x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); |
224 | 0 | x0 = _mm_xor_si128(x0, x4); |
225 | 0 | x1 = _mm_xor_si128(x1, x5); |
226 | 0 | x2 = _mm_xor_si128(x2, x6); |
227 | 0 | x3 = _mm_xor_si128(x3, x7); |
228 | 0 | x0 = _mm_xor_si128(x0, x8); |
229 | 0 | x1 = _mm_xor_si128(x1, x9); |
230 | 0 | x2 = _mm_xor_si128(x2, x10); |
231 | 0 | x3 = _mm_xor_si128(x3, x11); |
232 | |
|
233 | 0 | p += CRC32_FOLDING_BLOCK_SIZE * 4; |
234 | 0 | nr -= CRC32_FOLDING_BLOCK_SIZE * 4; |
235 | 0 | } |
236 | | |
237 | 3 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
238 | | /* fold 4 to 1, [x1, x2, x3] -> x0 */ |
239 | 3 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
240 | 3 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
241 | 3 | x0 = _mm_xor_si128(x0, x1); |
242 | 3 | x0 = _mm_xor_si128(x0, x4); |
243 | 3 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
244 | 3 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
245 | 3 | x0 = _mm_xor_si128(x0, x2); |
246 | 3 | x0 = _mm_xor_si128(x0, x4); |
247 | 3 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
248 | 3 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
249 | 3 | x0 = _mm_xor_si128(x0, x3); |
250 | 3 | x0 = _mm_xor_si128(x0, x4); |
251 | 3 | } |
252 | | |
253 | 30 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
254 | | /* folding by 1 */ |
255 | 73 | while (nr >= CRC32_FOLDING_BLOCK_SIZE) { |
256 | | /* load next to x2, fold to x0, x1 */ |
257 | 43 | x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
258 | 43 | x1 = _mm_clmulepi64_si128(x0, k, 0x00); |
259 | 43 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
260 | 43 | x0 = _mm_xor_si128(x0, x2); |
261 | 43 | x0 = _mm_xor_si128(x0, x1); |
262 | 43 | p += CRC32_FOLDING_BLOCK_SIZE; |
263 | 43 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
264 | 43 | } |
265 | | |
266 | | /* reduce 128-bits(final fold) to 96-bits */ |
267 | 30 | x1 = _mm_clmulepi64_si128(x0, k, 0x10); |
268 | 30 | x0 = _mm_srli_si128(x0, 8); |
269 | 30 | x0 = _mm_xor_si128(x0, x1); |
270 | | /* reduce 96-bits to 64-bits */ |
271 | 30 | x1 = _mm_shuffle_epi32(x0, 0xfc); |
272 | 30 | x0 = _mm_shuffle_epi32(x0, 0xf9); |
273 | 30 | k = _mm_loadu_si128((__m128i*)consts->k5k6); |
274 | 30 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
275 | 30 | x0 = _mm_xor_si128(x0, x1); |
276 | | |
277 | | /* barrett reduction */ |
278 | 30 | x1 = _mm_shuffle_epi32(x0, 0xf3); |
279 | 30 | x0 = _mm_slli_si128(x0, 4); |
280 | 30 | k = _mm_loadu_si128((__m128i*)consts->uPx); |
281 | 30 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
282 | 30 | x1 = _mm_clmulepi64_si128(x1, k, 0x10); |
283 | 30 | x0 = _mm_xor_si128(x1, x0); |
284 | 30 | *crc = _mm_extract_epi32(x0, 2); |
285 | 30 | return (nr_in - nr); /* the nr processed */ |
286 | 63 | } |
287 | | |
288 | | # if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) |
289 | | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
290 | | # else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */ |
291 | | size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
292 | | # endif |
293 | 63 | { |
294 | 63 | if (type > X86_CRC32_MAX) { |
295 | 0 | return 0; |
296 | 0 | } |
297 | 63 | const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type]; |
298 | | |
299 | 63 | switch (type) { |
300 | 0 | case X86_CRC32: |
301 | 0 | return crc32_pclmul_batch(crc, p, nr, consts); |
302 | 63 | case X86_CRC32B: |
303 | 63 | case X86_CRC32C: |
304 | 63 | return crc32_pclmul_reflected_batch(crc, p, nr, consts); |
305 | 0 | default: |
306 | 0 | return 0; |
307 | 63 | } |
308 | 63 | } |
309 | | #endif |
310 | | |
311 | | #ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER |
312 | | static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
313 | 0 | { |
314 | 0 | return 0; |
315 | 0 | } |
316 | | |
317 | | # ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO |
318 | | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update"))); |
319 | | |
320 | | typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr); |
321 | | |
322 | | ZEND_NO_SANITIZE_ADDRESS |
323 | | ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */ |
324 | 2 | static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) { |
325 | 2 | if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) { |
326 | 2 | return crc32_sse42_pclmul_update; |
327 | 2 | } |
328 | 0 | return crc32_x86_simd_update_default; |
329 | 2 | } |
330 | | # else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */ |
331 | | static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default; |
332 | | |
333 | | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) { |
334 | | return crc32_x86_simd_ptr(type, crc, p, nr); |
335 | | } |
336 | | |
337 | | /* {{{ PHP_MINIT_FUNCTION */ |
338 | | PHP_MINIT_FUNCTION(crc32_x86_intrin) |
339 | | { |
340 | | if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) { |
341 | | crc32_x86_simd_ptr = crc32_sse42_pclmul_update; |
342 | | } |
343 | | return SUCCESS; |
344 | | } |
345 | | /* }}} */ |
346 | | # endif |
347 | | #endif |