/src/php-src/ext/standard/crc32_x86.c
Line | Count | Source |
1 | | /* |
2 | | +----------------------------------------------------------------------+ |
3 | | | Copyright (c) The PHP Group | |
4 | | +----------------------------------------------------------------------+ |
5 | | | This source file is subject to version 3.01 of the PHP license, | |
6 | | | that is bundled with this package in the file LICENSE, and is | |
7 | | | available through the world-wide-web at the following url: | |
8 | | | https://www.php.net/license/3_01.txt | |
9 | | | If you did not receive a copy of the PHP license and are unable to | |
10 | | | obtain it through the world-wide-web, please send a note to | |
11 | | | license@php.net so we can mail you a copy immediately. | |
12 | | +----------------------------------------------------------------------+ |
13 | | | Author: Frank Du <frank.du@intel.com> | |
14 | | +----------------------------------------------------------------------+ |
15 | | | Compute the crc32 of the buffer. Based on: | |
16 | | | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ" | |
17 | | | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 | |
18 | | */ |
19 | | |
20 | | #include "crc32_x86.h" |
21 | | |
22 | | #if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) |
23 | | # include <nmmintrin.h> |
24 | | # include <wmmintrin.h> |
25 | | #endif |
26 | | |
27 | | #ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER |
28 | | # include "Zend/zend_cpuinfo.h" |
29 | | #endif |
30 | | |
31 | | #if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER) |
32 | | |
33 | | typedef struct _crc32_pclmul_bit_consts { |
34 | | uint64_t k1k2[2]; |
35 | | uint64_t k3k4[2]; |
36 | | uint64_t k5k6[2]; |
37 | | uint64_t uPx[2]; |
38 | | } crc32_pclmul_consts; |
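/*
 * How the fields are used below (the exact powers of x behind each constant
 * follow the paper cited in the file header and are not restated here):
 *   k1k2 - constants for the fold-by-4 loop (advance four 128-bit accumulators by 64 bytes)
 *   k3k4 - constants for folding a single 128-bit block at a time
 *   k5k6 - constants for the final 128 -> 96 -> 64 bit reduction
 *   uPx  - Barrett reduction pair: the quotient constant u and the polynomial P itself
 */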
39 | | |
40 | | static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = { |
41 | | { /* X86_CRC32, polynomial: 0x04C11DB7 */ |
42 | | {0x00e6228b11, 0x008833794c}, /* endianness swap */ |
43 | | {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */ |
44 | | {0x00490d678d, 0x00f200aa66}, /* endianness swap */ |
45 | | {0x0104d101df, 0x0104c11db7} |
46 | | }, |
47 | | { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */ |
48 | | {0x0154442bd4, 0x01c6e41596}, |
49 | | {0x01751997d0, 0x00ccaa009e}, |
50 | | {0x0163cd6124, 0x01db710640}, |
51 | | {0x01f7011641, 0x01db710641}, |
52 | | }, |
53 | | { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */ |
54 | | {0x00740eef02, 0x009e4addf8}, |
55 | | {0x00f20c0dfe, 0x014cd00bd6}, |
56 | | {0x00dd45aab8, 0x0000000000}, |
57 | | {0x00dea713f1, 0x0105ec76f0} |
58 | | } |
59 | | }; |
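/*
 * Folding constants of this kind are remainders of single powers of x modulo the
 * CRC polynomial P(x).  A minimal sketch of that computation for the MSB-first
 * (non-reflected) convention follows; gf2_xn_mod_p() is an illustrative helper
 * only, not something this file builds or calls, and the particular exponents
 * (and the bit reflection used for the CRC32B/CRC32C rows) are not reproduced.
 */
static uint32_t gf2_xn_mod_p(uint64_t n, uint32_t poly)
{
	uint32_t r = 1; /* the polynomial "1", i.e. x^0 */
	while (n--) {
		uint32_t msb = r & 0x80000000u;
		r <<= 1; /* multiply by x */
		if (msb) {
			r ^= poly; /* degree reached 32: subtract (XOR) P, whose x^32 term is implicit */
		}
	}
	return r;
}
/* Example: gf2_xn_mod_p(32, 0x04C11DB7) == 0x04C11DB7, i.e. x^32 mod P(x). */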
60 | | |
61 | | static uint8_t pclmul_shuf_mask_table[16] = { |
62 | | 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, |
63 | | 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, |
64 | | }; |
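/*
 * The mask above simply reverses the 16 bytes of a lane, so that earlier message
 * bytes end up in the more significant byte positions, matching the MSB-first bit
 * ordering the non-reflected folding assumes.  bytes_reversed_example() is an
 * illustrative helper only.
 */
static inline __m128i bytes_reversed_example(void)
{
	const __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
			8, 9, 10, 11, 12, 13, 14, 15);
	const __m128i m = _mm_loadu_si128((const __m128i *)pclmul_shuf_mask_table);
	return _mm_shuffle_epi8(v, m); /* yields bytes 15, 14, ..., 1, 0 */
}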
65 | | |
66 | | /* Folding of 128-bit data chunks */ |
67 | 6.17k | #define CRC32_FOLDING_BLOCK_SIZE (16) |
68 | | |
69 | | /* PCLMUL version of non-reflected crc32 */ |
70 | | ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); |
71 | | size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) |
72 | 23 | { |
73 | 23 | size_t nr_in = nr; |
74 | 23 | __m128i x0, x1, x2, k, shuf_mask; |
75 | | |
76 | 23 | if (nr < CRC32_FOLDING_BLOCK_SIZE) { |
77 | 6 | return 0; |
78 | 6 | } |
79 | | |
80 | 17 | shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table)); |
81 | 17 | x0 = _mm_cvtsi32_si128(*crc); |
82 | 17 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
83 | 17 | x0 = _mm_slli_si128(x0, 12); |
84 | 17 | x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ |
85 | 17 | x0 = _mm_xor_si128(x1, x0); |
86 | 17 | p += CRC32_FOLDING_BLOCK_SIZE; |
87 | 17 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
88 | | |
89 | 17 | if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { |
90 | 14 | __m128i x3, x4; |
91 | | |
92 | 14 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
93 | 14 | x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */ |
94 | 14 | x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
95 | 14 | x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ |
96 | 14 | x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
97 | 14 | x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */ |
98 | 14 | p += CRC32_FOLDING_BLOCK_SIZE * 3; |
99 | 14 | nr -= CRC32_FOLDING_BLOCK_SIZE * 3; |
100 | | |
101 | 14 | k = _mm_loadu_si128((__m128i *)consts->k1k2); |
102 | | /* parallel folding by 4 */ |
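/*
 * Each pass of this loop consumes 64 bytes: the two carry-less multiplies per
 * accumulator (immediates 0x00 and 0x11) multiply its low and high 64-bit halves
 * by the two k1k2 constants, and the XORs fold in the next four byte-swapped blocks.
 */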
103 | 1.40k | while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { |
104 | 1.39k | __m128i x5, x6, x7, x8, x9, x10, x11; |
105 | 1.39k | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
106 | 1.39k | x5 = _mm_clmulepi64_si128(x1, k, 0x00); |
107 | 1.39k | x6 = _mm_clmulepi64_si128(x2, k, 0x00); |
108 | 1.39k | x7 = _mm_clmulepi64_si128(x3, k, 0x00); |
109 | 1.39k | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
110 | 1.39k | x1 = _mm_clmulepi64_si128(x1, k, 0x11); |
111 | 1.39k | x2 = _mm_clmulepi64_si128(x2, k, 0x11); |
112 | 1.39k | x3 = _mm_clmulepi64_si128(x3, k, 0x11); |
113 | 1.39k | x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
114 | 1.39k | x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */ |
115 | 1.39k | x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
116 | 1.39k | x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */ |
117 | 1.39k | x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
118 | 1.39k | x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */ |
119 | 1.39k | x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); |
120 | 1.39k | x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */ |
121 | 1.39k | x0 = _mm_xor_si128(x0, x4); |
122 | 1.39k | x1 = _mm_xor_si128(x1, x5); |
123 | 1.39k | x2 = _mm_xor_si128(x2, x6); |
124 | 1.39k | x3 = _mm_xor_si128(x3, x7); |
125 | 1.39k | x0 = _mm_xor_si128(x0, x8); |
126 | 1.39k | x1 = _mm_xor_si128(x1, x9); |
127 | 1.39k | x2 = _mm_xor_si128(x2, x10); |
128 | 1.39k | x3 = _mm_xor_si128(x3, x11); |
129 | | |
130 | 1.39k | p += CRC32_FOLDING_BLOCK_SIZE * 4; |
131 | 1.39k | nr -= CRC32_FOLDING_BLOCK_SIZE * 4; |
132 | 1.39k | } |
133 | | |
134 | 14 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
135 | | /* fold 4 to 1, [x1, x2, x3] -> x0 */ |
136 | 14 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
137 | 14 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
138 | 14 | x0 = _mm_xor_si128(x0, x1); |
139 | 14 | x0 = _mm_xor_si128(x0, x4); |
140 | 14 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
141 | 14 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
142 | 14 | x0 = _mm_xor_si128(x0, x2); |
143 | 14 | x0 = _mm_xor_si128(x0, x4); |
144 | 14 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
145 | 14 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
146 | 14 | x0 = _mm_xor_si128(x0, x3); |
147 | 14 | x0 = _mm_xor_si128(x0, x4); |
148 | 14 | } |
149 | | |
150 | 17 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
151 | | /* folding by 1 */ |
152 | 27 | while (nr >= CRC32_FOLDING_BLOCK_SIZE) { |
153 | | /* load next to x2, fold to x0, x1 */ |
154 | 10 | x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
155 | 10 | x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */ |
156 | 10 | x1 = _mm_clmulepi64_si128(x0, k, 0x00); |
157 | 10 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
158 | 10 | x0 = _mm_xor_si128(x0, x2); |
159 | 10 | x0 = _mm_xor_si128(x0, x1); |
160 | 10 | p += CRC32_FOLDING_BLOCK_SIZE; |
161 | 10 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
162 | 10 | } |
163 | | |
164 | | /* reduce 128-bits (final fold) to 96-bits */ |
165 | 17 | k = _mm_loadu_si128((__m128i*)consts->k5k6); |
166 | 17 | x1 = _mm_clmulepi64_si128(x0, k, 0x11); |
167 | 17 | x0 = _mm_slli_si128(x0, 8); |
168 | 17 | x0 = _mm_srli_si128(x0, 4); |
169 | 17 | x0 = _mm_xor_si128(x0, x1); |
170 | | /* reduce 96-bits to 64-bits */ |
171 | 17 | x1 = _mm_clmulepi64_si128(x0, k, 0x01); |
172 | 17 | x0 = _mm_xor_si128(x0, x1); |
173 | | |
174 | | /* barrett reduction */ |
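/*
 * Barrett reduction of the remaining 64-bit folded value R to the 32-bit CRC,
 * using uPx = {u, P}: T1 = floor(R / x^32) * u, T2 = floor(T1 / x^32) * P, and the
 * CRC is the low 32 bits of R xor T2.  The byte shifts below are the divisions by
 * x^32, and all multiplications are carry-less (in GF(2)[x]).
 */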
175 | 17 | k = _mm_loadu_si128((__m128i*)consts->uPx); |
176 | 17 | x1 = _mm_move_epi64(x0); |
177 | 17 | x1 = _mm_srli_si128(x1, 4); |
178 | 17 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
179 | 17 | x1 = _mm_srli_si128(x1, 4); |
180 | 17 | x1 = _mm_clmulepi64_si128(x1, k, 0x10); |
181 | 17 | x0 = _mm_xor_si128(x1, x0); |
182 | 17 | *crc = _mm_extract_epi32(x0, 0); |
183 | 17 | return (nr_in - nr); /* the number of bytes processed */ |
184 | 23 | } |
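/*
 * For reference, the bit-at-a-time form of the MSB-first CRC-32 that the folding
 * above accelerates.  crc32_msb_update_ref() is an illustrative sketch only: the
 * non-SIMD fallback elsewhere in PHP is table-driven, and any initial/final
 * inversion of the CRC is the caller's business, exactly as with crc32_pclmul_batch().
 */
static void crc32_msb_update_ref(uint32_t *crc, const unsigned char *p, size_t nr)
{
	while (nr--) {
		*crc ^= (uint32_t)*p++ << 24; /* next message byte enters at the top */
		for (int i = 0; i < 8; i++) {
			if (*crc & 0x80000000u) {
				*crc = (*crc << 1) ^ 0x04C11DB7u; /* reduce by P when x^32 appears */
			} else {
				*crc <<= 1;
			}
		}
	}
}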
185 | | |
186 | | /* PCLMUL version of reflected crc32 */ |
187 | | ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)); |
188 | | size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts) |
189 | 67 | { |
190 | 67 | size_t nr_in = nr; |
191 | 67 | __m128i x0, x1, x2, k; |
192 | | |
193 | 67 | if (nr < CRC32_FOLDING_BLOCK_SIZE) { |
194 | 26 | return 0; |
195 | 26 | } |
196 | | |
197 | 41 | x0 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
198 | 41 | x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc)); |
199 | 41 | p += CRC32_FOLDING_BLOCK_SIZE; |
200 | 41 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
201 | 41 | if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) { |
202 | 24 | __m128i x3, x4; |
203 | | |
204 | 24 | x1 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
205 | 24 | x2 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
206 | 24 | x3 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
207 | 24 | p += CRC32_FOLDING_BLOCK_SIZE * 3; |
208 | 24 | nr -= CRC32_FOLDING_BLOCK_SIZE * 3; |
209 | | |
210 | 24 | k = _mm_loadu_si128((__m128i *)consts->k1k2); |
211 | | /* parallel folding by 4 */ |
212 | 486 | while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) { |
213 | 462 | __m128i x5, x6, x7, x8, x9, x10, x11; |
214 | 462 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
215 | 462 | x5 = _mm_clmulepi64_si128(x1, k, 0x00); |
216 | 462 | x6 = _mm_clmulepi64_si128(x2, k, 0x00); |
217 | 462 | x7 = _mm_clmulepi64_si128(x3, k, 0x00); |
218 | 462 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
219 | 462 | x1 = _mm_clmulepi64_si128(x1, k, 0x11); |
220 | 462 | x2 = _mm_clmulepi64_si128(x2, k, 0x11); |
221 | 462 | x3 = _mm_clmulepi64_si128(x3, k, 0x11); |
222 | 462 | x8 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
223 | 462 | x9 = _mm_loadu_si128((__m128i *)(p + 0x10)); |
224 | 462 | x10 = _mm_loadu_si128((__m128i *)(p + 0x20)); |
225 | 462 | x11 = _mm_loadu_si128((__m128i *)(p + 0x30)); |
226 | 462 | x0 = _mm_xor_si128(x0, x4); |
227 | 462 | x1 = _mm_xor_si128(x1, x5); |
228 | 462 | x2 = _mm_xor_si128(x2, x6); |
229 | 462 | x3 = _mm_xor_si128(x3, x7); |
230 | 462 | x0 = _mm_xor_si128(x0, x8); |
231 | 462 | x1 = _mm_xor_si128(x1, x9); |
232 | 462 | x2 = _mm_xor_si128(x2, x10); |
233 | 462 | x3 = _mm_xor_si128(x3, x11); |
234 | | |
235 | 462 | p += CRC32_FOLDING_BLOCK_SIZE * 4; |
236 | 462 | nr -= CRC32_FOLDING_BLOCK_SIZE * 4; |
237 | 462 | } |
238 | | |
239 | 24 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
240 | | /* fold 4 to 1, [x1, x2, x3] -> x0 */ |
241 | 24 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
242 | 24 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
243 | 24 | x0 = _mm_xor_si128(x0, x1); |
244 | 24 | x0 = _mm_xor_si128(x0, x4); |
245 | 24 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
246 | 24 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
247 | 24 | x0 = _mm_xor_si128(x0, x2); |
248 | 24 | x0 = _mm_xor_si128(x0, x4); |
249 | 24 | x4 = _mm_clmulepi64_si128(x0, k, 0x00); |
250 | 24 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
251 | 24 | x0 = _mm_xor_si128(x0, x3); |
252 | 24 | x0 = _mm_xor_si128(x0, x4); |
253 | 24 | } |
254 | | |
255 | 41 | k = _mm_loadu_si128((__m128i *)consts->k3k4); |
256 | | /* folding by 1 */ |
257 | 89 | while (nr >= CRC32_FOLDING_BLOCK_SIZE) { |
258 | | /* load next to x2, fold to x0, x1 */ |
259 | 48 | x2 = _mm_loadu_si128((__m128i *)(p + 0x00)); |
260 | 48 | x1 = _mm_clmulepi64_si128(x0, k, 0x00); |
261 | 48 | x0 = _mm_clmulepi64_si128(x0, k, 0x11); |
262 | 48 | x0 = _mm_xor_si128(x0, x2); |
263 | 48 | x0 = _mm_xor_si128(x0, x1); |
264 | 48 | p += CRC32_FOLDING_BLOCK_SIZE; |
265 | 48 | nr -= CRC32_FOLDING_BLOCK_SIZE; |
266 | 48 | } |
267 | | |
268 | | /* reduce 128-bits (final fold) to 96-bits */ |
269 | 41 | x1 = _mm_clmulepi64_si128(x0, k, 0x10); |
270 | 41 | x0 = _mm_srli_si128(x0, 8); |
271 | 41 | x0 = _mm_xor_si128(x0, x1); |
272 | | /* reduce 96-bits to 64-bits */ |
273 | 41 | x1 = _mm_shuffle_epi32(x0, 0xfc); |
274 | 41 | x0 = _mm_shuffle_epi32(x0, 0xf9); |
275 | 41 | k = _mm_loadu_si128((__m128i*)consts->k5k6); |
276 | 41 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
277 | 41 | x0 = _mm_xor_si128(x0, x1); |
278 | | |
279 | | /* barrett reduction */ |
280 | 41 | x1 = _mm_shuffle_epi32(x0, 0xf3); |
281 | 41 | x0 = _mm_slli_si128(x0, 4); |
282 | 41 | k = _mm_loadu_si128((__m128i*)consts->uPx); |
283 | 41 | x1 = _mm_clmulepi64_si128(x1, k, 0x00); |
284 | 41 | x1 = _mm_clmulepi64_si128(x1, k, 0x10); |
285 | 41 | x0 = _mm_xor_si128(x1, x0); |
286 | 41 | *crc = _mm_extract_epi32(x0, 2); |
287 | 41 | return (nr_in - nr); /* the number of bytes processed */ |
288 | 67 | } |
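/*
 * The reflected counterpart of the sketch above: LSB-first, with the bit-reversed
 * polynomial (0xEDB88320 for the X86_CRC32B row, 0x82F63B78 for X86_CRC32C).
 * crc32_lsb_update_ref() is illustrative only; initial/final XOR conventions are
 * again left to the caller.
 */
static void crc32_lsb_update_ref(uint32_t *crc, const unsigned char *p, size_t nr, uint32_t rpoly)
{
	while (nr--) {
		*crc ^= *p++; /* next message byte enters at the bottom */
		for (int i = 0; i < 8; i++) {
			if (*crc & 1u) {
				*crc = (*crc >> 1) ^ rpoly;
			} else {
				*crc >>= 1;
			}
		}
	}
}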
289 | | |
290 | | # if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) |
291 | | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
292 | | # else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */ |
293 | | size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
294 | | # endif |
295 | 90 | { |
296 | 90 | if (type > X86_CRC32_MAX) { |
297 | 0 | return 0; |
298 | 0 | } |
299 | 90 | const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type]; |
300 | | |
301 | 90 | switch (type) { |
302 | 23 | case X86_CRC32: |
303 | 23 | return crc32_pclmul_batch(crc, p, nr, consts); |
304 | 47 | case X86_CRC32B: |
305 | 67 | case X86_CRC32C: |
306 | 67 | return crc32_pclmul_reflected_batch(crc, p, nr, consts); |
307 | 0 | default: |
308 | 0 | return 0; |
309 | 90 | } |
310 | 90 | } |
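/*
 * Typical caller shape (illustrative, hypothetical helper): the return value is
 * the number of leading bytes the SIMD batch consumed, and the remainder --
 * including inputs shorter than one 16-byte block -- is finished with scalar
 * code.  PHP's real consumers follow the same pattern with a table-driven tail.
 */
static uint32_t crc32b_buffer_example(const unsigned char *p, size_t nr)
{
	uint32_t crc = ~0u; /* customary CRC-32B preset */
	size_t done = crc32_x86_simd_update(X86_CRC32B, &crc, p, nr);
	p += done;
	nr -= done;
	while (nr--) { /* scalar bit-at-a-time tail (sketch) */
		crc ^= *p++;
		for (int i = 0; i < 8; i++) {
			crc = (crc & 1u) ? (crc >> 1) ^ 0xEDB88320u : (crc >> 1);
		}
	}
	return ~crc; /* customary final inversion */
}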
311 | | #endif |
312 | | |
313 | | #ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER |
314 | | static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) |
315 | 0 | { |
316 | 0 | return 0; |
317 | 0 | } |
318 | | |
319 | | # ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO |
320 | | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update"))); |
321 | | |
322 | | typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr); |
323 | | |
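/*
 * The resolver below runs very early, while the dynamic loader is still
 * processing relocations, which is why it is kept out of AddressSanitizer
 * instrumentation via ZEND_NO_SANITIZE_ADDRESS.
 */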
324 | | ZEND_NO_SANITIZE_ADDRESS |
325 | | ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */ |
326 | | static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) { |
327 | | if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) { |
328 | | return crc32_sse42_pclmul_update; |
329 | | } |
330 | | return crc32_x86_simd_update_default; |
331 | | } |
332 | | # else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */ |
333 | | static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default; |
334 | | |
335 | 90 | size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) { |
336 | 90 | return crc32_x86_simd_ptr(type, crc, p, nr); |
337 | 90 | } |
338 | | |
339 | | /* {{{ PHP_MINIT_FUNCTION */ |
340 | | PHP_MINIT_FUNCTION(crc32_x86_intrin) |
341 | 16 | { |
342 | 16 | if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) { |
343 | 16 | crc32_x86_simd_ptr = crc32_sse42_pclmul_update; |
344 | 16 | } |
345 | 16 | return SUCCESS; |
346 | 16 | } |
347 | | /* }}} */ |
348 | | # endif |
349 | | #endif |