Coverage Report

Created: 2026-06-02 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/standard/crc32_x86.c
Line
Count
Source
1
/*
2
  +----------------------------------------------------------------------+
3
  | Copyright © The PHP Group and Contributors.                          |
4
  +----------------------------------------------------------------------+
5
  | This source file is subject to the Modified BSD License that is      |
6
  | bundled with this package in the file LICENSE, and is available      |
7
  | through the World Wide Web at <https://www.php.net/license/>.        |
8
  |                                                                      |
9
  | SPDX-License-Identifier: BSD-3-Clause                                |
10
  +----------------------------------------------------------------------+
11
  | Author: Frank Du <frank.du@intel.com>                                |
12
  +----------------------------------------------------------------------+
13
  | Compute the crc32 of the buffer. Based on:                           |
14
  | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"       |
15
  |  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
16
*/
17
18
#include "crc32_x86.h"
19
20
#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
21
# include <nmmintrin.h>
22
# include <wmmintrin.h>
23
#endif
24
25
#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
26
# include "Zend/zend_cpuinfo.h"
27
#endif
28
29
#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
30
31
/*
 * Constant set consumed by the PCLMUL CRC32 routines for one polynomial.
 * Every member is a pair of 64-bit values that gets loaded as a single
 * 128-bit vector.
 */
struct _crc32_pclmul_bit_consts {
  uint64_t k1k2[2]; /* loaded for the parallel fold-by-4 loop */
  uint64_t k3k4[2]; /* loaded for the 4-to-1 fold and the fold-by-1 loop */
  uint64_t k5k6[2]; /* loaded for the final reduction towards 64 bits */
  uint64_t uPx[2];  /* loaded for the Barrett reduction */
};

typedef struct _crc32_pclmul_bit_consts crc32_pclmul_consts;
37
38
/*
 * Folding and reduction constants, indexed by X86_CRC32_TYPE.
 * NOTE(review): entries are positional; the per-entry labels below assume the
 * enumerators are ordered X86_CRC32, X86_CRC32B, X86_CRC32C - confirm against
 * crc32_x86.h.
 */
static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
  /* X86_CRC32, polynomial: 0x04C11DB7 */
  {
    .k1k2 = {0x00e6228b11, 0x008833794c}, /* endianness swap */
    .k3k4 = {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
    .k5k6 = {0x00490d678d, 0x00f200aa66}, /* endianness swap */
    .uPx  = {0x0104d101df, 0x0104c11db7}
  },
  /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
  {
    .k1k2 = {0x0154442bd4, 0x01c6e41596},
    .k3k4 = {0x01751997d0, 0x00ccaa009e},
    .k5k6 = {0x0163cd6124, 0x01db710640},
    .uPx  = {0x01f7011641, 0x01db710641},
  },
  /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
  {
    .k1k2 = {0x00740eef02, 0x009e4addf8},
    .k3k4 = {0x00f20c0dfe, 0x014cd00bd6},
    .k5k6 = {0x00dd45aab8, 0x0000000000},
    .uPx  = {0x00dea713f1, 0x0105ec76f0}
  }
};
58
59
/*
 * Byte-shuffle control vector: entry i holds 15 - i, so using it as the mask
 * of _mm_shuffle_epi8 reverses the 16 bytes of a vector (the "endianness
 * swap" applied to every block in crc32_pclmul_batch).
 * The table is only ever read, hence const.
 */
static const uint8_t pclmul_shuf_mask_table[16] = {
  0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
  0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};
63
64
/* Folding of 128-bit data chunks */
65
169k
/* Bytes per folded chunk; the batch functions consume input only in multiples of this. */
#define CRC32_FOLDING_BLOCK_SIZE (16)
66
67
/* PCLMUL version of non-reflected crc32 */
68
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
69
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
70
35
{
71
35
  size_t nr_in = nr;
72
35
  __m128i x0, x1, x2, k, shuf_mask;
73
74
35
  if (nr < CRC32_FOLDING_BLOCK_SIZE) {
75
3
    return 0;
76
3
  }
77
78
32
  shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
79
32
  x0 = _mm_cvtsi32_si128(*crc);
80
32
  x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
81
32
  x0 = _mm_slli_si128(x0, 12);
82
32
  x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
83
32
  x0 = _mm_xor_si128(x1, x0);
84
32
  p += CRC32_FOLDING_BLOCK_SIZE;
85
32
  nr -= CRC32_FOLDING_BLOCK_SIZE;
86
87
32
  if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
88
27
    __m128i x3, x4;
89
90
27
    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
91
27
    x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
92
27
    x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
93
27
    x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
94
27
    x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
95
27
    x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
96
27
    p += CRC32_FOLDING_BLOCK_SIZE * 3;
97
27
    nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
98
99
27
    k = _mm_loadu_si128((__m128i *)consts->k1k2);
100
    /* parallel folding by 4 */
101
29.1k
    while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
102
29.1k
      __m128i x5, x6, x7, x8, x9, x10, x11;
103
29.1k
      x4 = _mm_clmulepi64_si128(x0, k, 0x00);
104
29.1k
      x5 = _mm_clmulepi64_si128(x1, k, 0x00);
105
29.1k
      x6 = _mm_clmulepi64_si128(x2, k, 0x00);
106
29.1k
      x7 = _mm_clmulepi64_si128(x3, k, 0x00);
107
29.1k
      x0 = _mm_clmulepi64_si128(x0, k, 0x11);
108
29.1k
      x1 = _mm_clmulepi64_si128(x1, k, 0x11);
109
29.1k
      x2 = _mm_clmulepi64_si128(x2, k, 0x11);
110
29.1k
      x3 = _mm_clmulepi64_si128(x3, k, 0x11);
111
29.1k
      x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
112
29.1k
      x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
113
29.1k
      x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
114
29.1k
      x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
115
29.1k
      x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
116
29.1k
      x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
117
29.1k
      x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
118
29.1k
      x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
119
29.1k
      x0 = _mm_xor_si128(x0, x4);
120
29.1k
      x1 = _mm_xor_si128(x1, x5);
121
29.1k
      x2 = _mm_xor_si128(x2, x6);
122
29.1k
      x3 = _mm_xor_si128(x3, x7);
123
29.1k
      x0 = _mm_xor_si128(x0, x8);
124
29.1k
      x1 = _mm_xor_si128(x1, x9);
125
29.1k
      x2 = _mm_xor_si128(x2, x10);
126
29.1k
      x3 = _mm_xor_si128(x3, x11);
127
128
29.1k
      p += CRC32_FOLDING_BLOCK_SIZE * 4;
129
29.1k
      nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
130
29.1k
    }
131
132
27
    k = _mm_loadu_si128((__m128i *)consts->k3k4);
133
    /* fold 4 to 1, [x1, x2, x3] -> x0 */
134
27
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
135
27
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
136
27
    x0 = _mm_xor_si128(x0, x1);
137
27
    x0 = _mm_xor_si128(x0, x4);
138
27
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
139
27
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
140
27
    x0 = _mm_xor_si128(x0, x2);
141
27
    x0 = _mm_xor_si128(x0, x4);
142
27
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
143
27
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
144
27
    x0 = _mm_xor_si128(x0, x3);
145
27
    x0 = _mm_xor_si128(x0, x4);
146
27
  }
147
148
32
  k = _mm_loadu_si128((__m128i *)consts->k3k4);
149
  /* folding by 1 */
150
59
  while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
151
    /* load next to x2, fold to x0, x1 */
152
27
    x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
153
27
    x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
154
27
    x1 = _mm_clmulepi64_si128(x0, k, 0x00);
155
27
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
156
27
    x0 = _mm_xor_si128(x0, x2);
157
27
    x0 = _mm_xor_si128(x0, x1);
158
27
    p += CRC32_FOLDING_BLOCK_SIZE;
159
27
    nr -= CRC32_FOLDING_BLOCK_SIZE;
160
27
  }
161
162
  /* reduce 128-bits(final fold) to 96-bits */
163
32
  k = _mm_loadu_si128((__m128i*)consts->k5k6);
164
32
  x1 = _mm_clmulepi64_si128(x0, k, 0x11);
165
32
  x0 = _mm_slli_si128(x0, 8);
166
32
  x0 = _mm_srli_si128(x0, 4);
167
32
  x0 = _mm_xor_si128(x0, x1);
168
  /* reduce 96-bits to 64-bits */
169
32
  x1 = _mm_clmulepi64_si128(x0, k, 0x01);
170
32
  x0 = _mm_xor_si128(x0, x1);
171
172
  /* barrett reduction */
173
32
  k = _mm_loadu_si128((__m128i*)consts->uPx);
174
32
  x1 = _mm_move_epi64(x0);
175
32
  x1 = _mm_srli_si128(x1, 4);
176
32
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
177
32
  x1 = _mm_srli_si128(x1, 4);
178
32
  x1 = _mm_clmulepi64_si128(x1, k, 0x10);
179
32
  x0 = _mm_xor_si128(x1, x0);
180
32
  *crc =  _mm_extract_epi32(x0, 0);
181
32
  return (nr_in - nr); /* the nr processed */
182
35
}
183
184
/* PCLMUL version of reflected crc32 */
185
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
186
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
187
32
{
188
32
  size_t nr_in = nr;
189
32
  __m128i x0, x1, x2, k;
190
191
32
  if (nr < CRC32_FOLDING_BLOCK_SIZE) {
192
9
    return 0;
193
9
  }
194
195
23
  x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
196
23
  x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
197
23
  p += CRC32_FOLDING_BLOCK_SIZE;
198
23
  nr -= CRC32_FOLDING_BLOCK_SIZE;
199
23
  if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
200
19
    __m128i x3, x4;
201
202
19
    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
203
19
    x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
204
19
    x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
205
19
    p += CRC32_FOLDING_BLOCK_SIZE * 3;
206
19
    nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
207
208
19
    k = _mm_loadu_si128((__m128i *)consts->k1k2);
209
    /* parallel folding by 4 */
210
27.1k
    while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
211
27.1k
      __m128i x5, x6, x7, x8, x9, x10, x11;
212
27.1k
      x4 = _mm_clmulepi64_si128(x0, k, 0x00);
213
27.1k
      x5 = _mm_clmulepi64_si128(x1, k, 0x00);
214
27.1k
      x6 = _mm_clmulepi64_si128(x2, k, 0x00);
215
27.1k
      x7 = _mm_clmulepi64_si128(x3, k, 0x00);
216
27.1k
      x0 = _mm_clmulepi64_si128(x0, k, 0x11);
217
27.1k
      x1 = _mm_clmulepi64_si128(x1, k, 0x11);
218
27.1k
      x2 = _mm_clmulepi64_si128(x2, k, 0x11);
219
27.1k
      x3 = _mm_clmulepi64_si128(x3, k, 0x11);
220
27.1k
      x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
221
27.1k
      x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
222
27.1k
      x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
223
27.1k
      x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
224
27.1k
      x0 = _mm_xor_si128(x0, x4);
225
27.1k
      x1 = _mm_xor_si128(x1, x5);
226
27.1k
      x2 = _mm_xor_si128(x2, x6);
227
27.1k
      x3 = _mm_xor_si128(x3, x7);
228
27.1k
      x0 = _mm_xor_si128(x0, x8);
229
27.1k
      x1 = _mm_xor_si128(x1, x9);
230
27.1k
      x2 = _mm_xor_si128(x2, x10);
231
27.1k
      x3 = _mm_xor_si128(x3, x11);
232
233
27.1k
      p += CRC32_FOLDING_BLOCK_SIZE * 4;
234
27.1k
      nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
235
27.1k
    }
236
237
19
    k = _mm_loadu_si128((__m128i *)consts->k3k4);
238
    /* fold 4 to 1, [x1, x2, x3] -> x0 */
239
19
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
240
19
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
241
19
    x0 = _mm_xor_si128(x0, x1);
242
19
    x0 = _mm_xor_si128(x0, x4);
243
19
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
244
19
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
245
19
    x0 = _mm_xor_si128(x0, x2);
246
19
    x0 = _mm_xor_si128(x0, x4);
247
19
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
248
19
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
249
19
    x0 = _mm_xor_si128(x0, x3);
250
19
    x0 = _mm_xor_si128(x0, x4);
251
19
  }
252
253
23
  k = _mm_loadu_si128((__m128i *)consts->k3k4);
254
  /* folding by 1 */
255
36
  while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
256
    /* load next to x2, fold to x0, x1 */
257
13
    x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
258
13
    x1 = _mm_clmulepi64_si128(x0, k, 0x00);
259
13
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
260
13
    x0 = _mm_xor_si128(x0, x2);
261
13
    x0 = _mm_xor_si128(x0, x1);
262
13
    p += CRC32_FOLDING_BLOCK_SIZE;
263
13
    nr -= CRC32_FOLDING_BLOCK_SIZE;
264
13
  }
265
266
  /* reduce 128-bits(final fold) to 96-bits */
267
23
  x1 = _mm_clmulepi64_si128(x0, k, 0x10);
268
23
  x0 = _mm_srli_si128(x0, 8);
269
23
  x0 = _mm_xor_si128(x0, x1);
270
  /* reduce 96-bits to 64-bits */
271
23
  x1 = _mm_shuffle_epi32(x0, 0xfc);
272
23
  x0 = _mm_shuffle_epi32(x0, 0xf9);
273
23
  k = _mm_loadu_si128((__m128i*)consts->k5k6);
274
23
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
275
23
  x0 = _mm_xor_si128(x0, x1);
276
277
  /* barrett reduction */
278
23
  x1 = _mm_shuffle_epi32(x0, 0xf3);
279
23
  x0 = _mm_slli_si128(x0, 4);
280
23
  k = _mm_loadu_si128((__m128i*)consts->uPx);
281
23
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
282
23
  x1 = _mm_clmulepi64_si128(x1, k, 0x10);
283
23
  x0 = _mm_xor_si128(x1, x0);
284
23
  *crc =  _mm_extract_epi32(x0, 2);
285
23
  return (nr_in - nr); /* the nr processed */
286
32
}
287
288
# if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE)
289
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
290
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
291
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
292
# endif
293
67
{
294
67
  if (type > X86_CRC32_MAX) {
295
0
    return 0;
296
0
  }
297
67
  const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];
298
299
67
  switch (type) {
300
35
  case X86_CRC32:
301
35
    return crc32_pclmul_batch(crc, p, nr, consts);
302
9
  case X86_CRC32B:
303
32
  case X86_CRC32C:
304
32
    return crc32_pclmul_reflected_batch(crc, p, nr, consts);
305
0
  default:
306
0
    return 0;
307
67
  }
308
67
}
309
#endif
310
311
#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
312
/*
 * Fallback implementation used when the SSE4.2/PCLMUL path is not selected:
 * it consumes nothing and leaves *crc as it was.
 */
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
  /* All arguments are intentionally ignored. */
  (void)type;
  (void)crc;
  (void)p;
  (void)nr;

  return 0; /* zero bytes processed */
}
316
317
# ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
318
/* Declared as a GNU ifunc: the implementation is whatever resolve_crc32_x86_simd_update() returns. */
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));
319
320
/* Pointer type shared by the SIMD update implementations; it is the return type of the ifunc resolver. */
typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);
321
322
ZEND_NO_SANITIZE_ADDRESS
323
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
324
2
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
325
2
  if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
326
2
    return crc32_sse42_pclmul_update;
327
2
  }
328
0
  return crc32_x86_simd_update_default;
329
2
}
330
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
331
/* Dispatch pointer for the function-pointer build: starts at the no-op default and is switched in PHP_MINIT_FUNCTION(crc32_x86_intrin) when the CPU supports SSE4.2 and PCLMUL. */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;
332
333
/* Public entry point for the function-pointer build: forwards to the currently selected implementation. */
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
  return (*crc32_x86_simd_ptr)(type, crc, p, nr);
}
336
337
/* {{{ PHP_MINIT_FUNCTION */
338
PHP_MINIT_FUNCTION(crc32_x86_intrin)
339
{
340
  if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
341
    crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
342
  }
343
  return SUCCESS;
344
}
345
/* }}} */
346
# endif
347
#endif