Coverage Report

Created: 2025-12-31 07:28

/src/php-src/ext/standard/crc32_x86.c
/*
  +----------------------------------------------------------------------+
  | Copyright (c) The PHP Group                                          |
  +----------------------------------------------------------------------+
  | This source file is subject to version 3.01 of the PHP license,      |
  | that is bundled with this package in the file LICENSE, and is        |
  | available through the world-wide-web at the following url:           |
  | https://www.php.net/license/3_01.txt                                 |
  | If you did not receive a copy of the PHP license and are unable to   |
  | obtain it through the world-wide-web, please send a note to          |
  | license@php.net so we can mail you a copy immediately.               |
  +----------------------------------------------------------------------+
  | Author: Frank Du <frank.du@intel.com>                                |
  +----------------------------------------------------------------------+
  | Compute the crc32 of the buffer. Based on:                           |
  | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"       |
  |  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
*/
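
/*
 * The CRC is the remainder of the message polynomial (times x^32) divided
 * by P(x) over GF(2). The routines below keep that remainder in 128-bit
 * vectors: the main loop "folds" 64 input bytes at a time into four
 * accumulators with carry-less multiplies (PCLMULQDQ), the accumulators
 * are then folded down to one, and a final Barrett reduction yields the
 * 32-bit result.
 */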

#include "crc32_x86.h"

#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif

#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)

typedef struct _crc32_pclmul_bit_consts {
  uint64_t k1k2[2];
  uint64_t k3k4[2];
  uint64_t k5k6[2];
  uint64_t uPx[2];
} crc32_pclmul_consts;

static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
  { /* X86_CRC32, polynomial: 0x04C11DB7 */
    {0x00e6228b11, 0x008833794c}, /* endianness swap */
    {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
    {0x00490d678d, 0x00f200aa66}, /* endianness swap */
    {0x0104d101df, 0x0104c11db7}
  },
  { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
    {0x0154442bd4, 0x01c6e41596},
    {0x01751997d0, 0x00ccaa009e},
    {0x0163cd6124, 0x01db710640},
    {0x01f7011641, 0x01db710641},
  },
  { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
    {0x00740eef02, 0x009e4addf8},
    {0x00f20c0dfe, 0x014cd00bd6},
    {0x00dd45aab8, 0x0000000000},
    {0x00dea713f1, 0x0105ec76f0}
  }
};
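
/*
 * The entries above follow the constant scheme of the cited paper: each k
 * is a power of x reduced mod P(x), expressed in the bit order of the
 * variant, and is used as a carry-less multiplier. k1/k2 fold the
 * remainder across the 64-byte stride of the 4-way loop, k3/k4 across a
 * single 16-byte block, and k5/k6 drive the 128-to-64-bit reduction. uPx
 * packs the Barrett pair: the quotient u = x^64 div P(x), then P(x)
 * itself (note 0x0104c11db7 and 0x01db710641 above, the 33-bit forms of
 * the polynomial).
 */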

static uint8_t pclmul_shuf_mask_table[16] = {
  0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
  0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};
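
/*
 * _mm_loadu_si128 loads bytes in little-endian order, which is backwards
 * for the non-reflected X86_CRC32 variant, so each 16-byte block is
 * byte-reversed with this PSHUFB mask before folding. The reflected
 * variants need no swap.
 */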

/* Folding of 128-bit data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)

/* PCLMUL version of non-reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
  size_t nr_in = nr;
  __m128i x0, x1, x2, k, shuf_mask;

  if (nr < CRC32_FOLDING_BLOCK_SIZE) {
    return 0;
  }

  shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
  x0 = _mm_cvtsi32_si128(*crc);
  x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
  x0 = _mm_slli_si128(x0, 12);
  x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
  x0 = _mm_xor_si128(x1, x0);
  p += CRC32_FOLDING_BLOCK_SIZE;
  nr -= CRC32_FOLDING_BLOCK_SIZE;

  if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
    __m128i x3, x4;

    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
    x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
    x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
    x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
    x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
    p += CRC32_FOLDING_BLOCK_SIZE * 3;
    nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

    k = _mm_loadu_si128((__m128i *)consts->k1k2);
    /* parallel folding by 4 */
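    /* Each pass folds 64 new bytes into x0..x3: per accumulator, the low
     * 64 bits are multiplied by k1 (clmul immediate 0x00) and the high
     * 64 bits by k2 (immediate 0x11), and both products are XORed with
     * the next 16-byte block, keeping each lane congruent mod P(x). */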
    while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
      __m128i x5, x6, x7, x8, x9, x10, x11;
      x4 = _mm_clmulepi64_si128(x0, k, 0x00);
      x5 = _mm_clmulepi64_si128(x1, k, 0x00);
      x6 = _mm_clmulepi64_si128(x2, k, 0x00);
      x7 = _mm_clmulepi64_si128(x3, k, 0x00);
      x0 = _mm_clmulepi64_si128(x0, k, 0x11);
      x1 = _mm_clmulepi64_si128(x1, k, 0x11);
      x2 = _mm_clmulepi64_si128(x2, k, 0x11);
      x3 = _mm_clmulepi64_si128(x3, k, 0x11);
      x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
      x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
      x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
      x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
      x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
      x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
      x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
      x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
      x0 = _mm_xor_si128(x0, x4);
      x1 = _mm_xor_si128(x1, x5);
      x2 = _mm_xor_si128(x2, x6);
      x3 = _mm_xor_si128(x3, x7);
      x0 = _mm_xor_si128(x0, x8);
      x1 = _mm_xor_si128(x1, x9);
      x2 = _mm_xor_si128(x2, x10);
      x3 = _mm_xor_si128(x3, x11);

      p += CRC32_FOLDING_BLOCK_SIZE * 4;
      nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* fold 4 to 1, [x1, x2, x3] -> x0 */
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x1);
    x0 = _mm_xor_si128(x0, x4);
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x2);
    x0 = _mm_xor_si128(x0, x4);
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x3);
    x0 = _mm_xor_si128(x0, x4);
  }

  k = _mm_loadu_si128((__m128i *)consts->k3k4);
  /* folding by 1 */
  while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
    /* load next to x2, fold to x0, x1 */
    x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
    x1 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x2);
    x0 = _mm_xor_si128(x0, x1);
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
  }

  /* reduce 128 bits (final fold) to 96 bits */
  k = _mm_loadu_si128((__m128i*)consts->k5k6);
  x1 = _mm_clmulepi64_si128(x0, k, 0x11);
  x0 = _mm_slli_si128(x0, 8);
  x0 = _mm_srli_si128(x0, 4);
  x0 = _mm_xor_si128(x0, x1);
  /* reduce 96 bits to 64 bits */
  x1 = _mm_clmulepi64_si128(x0, k, 0x01);
  x0 = _mm_xor_si128(x0, x1);

  /* barrett reduction */
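  /* Computes the final 32-bit remainder without division:
   * T1 = (R div x^32) * u, T2 = (T1 div x^32) * P(x), crc = (R xor T2)
   * mod x^32; the 4-byte srli/slli shifts below are the div/mod by x^32. */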
  k = _mm_loadu_si128((__m128i*)consts->uPx);
  x1 = _mm_move_epi64(x0);
  x1 = _mm_srli_si128(x1, 4);
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
  x1 = _mm_srli_si128(x1, 4);
  x1 = _mm_clmulepi64_si128(x1, k, 0x10);
  x0 = _mm_xor_si128(x1, x0);
  *crc = _mm_extract_epi32(x0, 0);
  return (nr_in - nr); /* number of bytes processed */
}

/* PCLMUL version of reflected crc32 */
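/*
 * Mirror image of crc32_pclmul_batch for the bit-reflected variants
 * (X86_CRC32B/X86_CRC32C): blocks are used as loaded (no byte swap), the
 * reductions shift in the opposite direction (with _mm_shuffle_epi32
 * rearranging 32-bit lanes instead of byte shifts), and the result is
 * extracted from lane 2 rather than lane 0.
 */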
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
  size_t nr_in = nr;
  __m128i x0, x1, x2, k;

  if (nr < CRC32_FOLDING_BLOCK_SIZE) {
    return 0;
  }

  x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
  x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
  p += CRC32_FOLDING_BLOCK_SIZE;
  nr -= CRC32_FOLDING_BLOCK_SIZE;
  if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
    __m128i x3, x4;

    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
    p += CRC32_FOLDING_BLOCK_SIZE * 3;
    nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

    k = _mm_loadu_si128((__m128i *)consts->k1k2);
    /* parallel folding by 4 */
    while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
      __m128i x5, x6, x7, x8, x9, x10, x11;
      x4 = _mm_clmulepi64_si128(x0, k, 0x00);
      x5 = _mm_clmulepi64_si128(x1, k, 0x00);
      x6 = _mm_clmulepi64_si128(x2, k, 0x00);
      x7 = _mm_clmulepi64_si128(x3, k, 0x00);
      x0 = _mm_clmulepi64_si128(x0, k, 0x11);
      x1 = _mm_clmulepi64_si128(x1, k, 0x11);
      x2 = _mm_clmulepi64_si128(x2, k, 0x11);
      x3 = _mm_clmulepi64_si128(x3, k, 0x11);
      x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
      x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
      x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
      x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
      x0 = _mm_xor_si128(x0, x4);
      x1 = _mm_xor_si128(x1, x5);
      x2 = _mm_xor_si128(x2, x6);
      x3 = _mm_xor_si128(x3, x7);
      x0 = _mm_xor_si128(x0, x8);
      x1 = _mm_xor_si128(x1, x9);
      x2 = _mm_xor_si128(x2, x10);
      x3 = _mm_xor_si128(x3, x11);

      p += CRC32_FOLDING_BLOCK_SIZE * 4;
      nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* fold 4 to 1, [x1, x2, x3] -> x0 */
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x1);
    x0 = _mm_xor_si128(x0, x4);
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x2);
    x0 = _mm_xor_si128(x0, x4);
    x4 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x3);
    x0 = _mm_xor_si128(x0, x4);
  }

  k = _mm_loadu_si128((__m128i *)consts->k3k4);
  /* folding by 1 */
  while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
    /* load next to x2, fold to x0, x1 */
    x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x1 = _mm_clmulepi64_si128(x0, k, 0x00);
    x0 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_xor_si128(x0, x2);
    x0 = _mm_xor_si128(x0, x1);
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
  }

  /* reduce 128 bits (final fold) to 96 bits */
  x1 = _mm_clmulepi64_si128(x0, k, 0x10);
  x0 = _mm_srli_si128(x0, 8);
  x0 = _mm_xor_si128(x0, x1);
  /* reduce 96 bits to 64 bits */
  x1 = _mm_shuffle_epi32(x0, 0xfc);
  x0 = _mm_shuffle_epi32(x0, 0xf9);
  k = _mm_loadu_si128((__m128i*)consts->k5k6);
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
  x0 = _mm_xor_si128(x0, x1);

  /* barrett reduction (see the note in crc32_pclmul_batch) */
  x1 = _mm_shuffle_epi32(x0, 0xf3);
  x0 = _mm_slli_si128(x0, 4);
  k = _mm_loadu_si128((__m128i*)consts->uPx);
  x1 = _mm_clmulepi64_si128(x1, k, 0x00);
  x1 = _mm_clmulepi64_si128(x1, k, 0x10);
  x0 = _mm_xor_si128(x1, x0);
  *crc = _mm_extract_epi32(x0, 2);
  return (nr_in - nr); /* number of bytes processed */
}

# if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE)
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
  if (type > X86_CRC32_MAX) {
    return 0;
  }
  const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

  switch (type) {
  case X86_CRC32:
    return crc32_pclmul_batch(crc, p, nr, consts);
  case X86_CRC32B:
  case X86_CRC32C:
    return crc32_pclmul_reflected_batch(crc, p, nr, consts);
  default:
    return 0;
  }
}
#endif
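
/*
 * Runtime dispatch follows: under ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO the
 * exported symbol is bound once at load time through a GNU ifunc resolver;
 * under the function-pointer fallback it starts at a stub that processes
 * zero bytes and is repointed in PHP_MINIT once SSE4.2 and PCLMUL support
 * has been confirmed.
 */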

#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
  return 0;
}

# ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
  if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
    return crc32_sse42_pclmul_update;
  }
  return crc32_x86_simd_update_default;
}
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
  return crc32_x86_simd_ptr(type, crc, p, nr);
}

/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
  if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
    crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
  }
  return SUCCESS;
}
/* }}} */
# endif
#endif
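
For context, the sketch below shows how a caller might drive crc32_x86_simd_update: the routine only consumes whole 16-byte blocks (and consumes nothing on CPUs without SSE4.2/PCLMUL), so the caller owns the pre/post inversion and the scalar tail. The helper name crc32b_of_buffer and the bit-serial tail loop are illustrative stand-ins, not part of this file (php-src's crc32() uses its crc32tab lookup table instead); 0xEDB88320 is the bit-reflected form of the 0x04C11DB7 polynomial used by X86_CRC32B.

#include <stddef.h>
#include <stdint.h>
#include "crc32_x86.h" /* crc32_x86_simd_update, X86_CRC32B */

static uint32_t crc32b_of_buffer(const unsigned char *p, size_t nr)
{
  uint32_t crc = 0xFFFFFFFFu; /* conventional CRC-32 pre-inversion */

  /* Fold whole 16-byte blocks with PCLMUL where available; returns the
   * number of bytes handled (0 on CPUs without SSE4.2/PCLMUL), leaving
   * the remainder to the scalar tail below. */
  size_t done = crc32_x86_simd_update(X86_CRC32B, &crc, p, nr);
  p += done;
  nr -= done;

  /* Scalar tail, bit by bit (stand-in for PHP's table-driven update). */
  while (nr--) {
    crc ^= *p++;
    for (int i = 0; i < 8; i++) {
      crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : (crc >> 1);
    }
  }
  return ~crc; /* conventional final inversion */
}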