Coverage Report

Created: 2026-05-30 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
Line
Count
Source
1
/* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding
2
 * approach with the PCLMULQDQ and VPCMULQDQ instructions.
3
 *
4
 * A white paper describing this algorithm can be found at:
5
 *     doc/crc-pclmulqdq.pdf
6
 *
7
 * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support)
8
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
9
 * Copyright (C) 2016 Marian Beermann (support for initial value)
10
 * Authors:
11
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
12
 *     Jim Guilford    <james.guilford@intel.com>
13
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
14
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
15
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
16
 *
17
 * For conditions of distribution and use, see copyright notice in zlib.h
18
 */
19
20
#include "zbuild.h"
21
22
#include <immintrin.h>
23
#include <wmmintrin.h>
24
#include <smmintrin.h> // _mm_extract_epi32
25
26
#include "crc32_braid_p.h"
27
#include "crc32_braid_tbl.h"
28
#include "crc32_p.h"
29
#include "x86_intrins.h"
30
31
/* 512-bit VPCLMULQDQ path requires AVX-512F */
32
#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
33
#  if defined(_MSC_VER) && _MSC_VER < 1920
34
     /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */
35
#    define z512_xor3_epi64(a, b, c)        _mm512_ternarylogic_epi32(a, b, c, 0x96)
36
#    define z512_inserti64x2(a, b, imm)     _mm512_inserti32x4(a, b, imm)
37
#    define z512_extracti64x2(a, imm)       _mm512_extracti32x4_epi32(a, imm)
38
#  else
39
0
#    define z512_xor3_epi64(a, b, c)        _mm512_ternarylogic_epi64(a, b, c, 0x96)
40
#    if defined(__AVX512DQ__)
41
#      if defined(_MSC_VER) && !defined(_MM_K0_REG8)
42
#        define z512_inserti64x2(a, b, imm) _mm512_maskz_inserti64x2(UINT8_MAX, a, b, imm)
43
#      else
44
0
#        define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm)
45
#      endif
46
0
#      define z512_extracti64x2(a, imm)     _mm512_extracti64x2_epi64(a, imm)
47
#    else
48
#      define z512_inserti64x2(a, b, imm)   _mm512_inserti32x4(a, b, imm)
49
#      define z512_extracti64x2(a, imm)     _mm512_extracti32x4_epi32(a, imm)
50
#    endif
51
#  endif
52
#  ifdef __AVX512VL__
53
0
#    define z128_xor3_epi64(a, b, c)        _mm_ternarylogic_epi64(a, b, c, 0x96)
54
#  endif
55
#endif
56
/* 256-bit VPCLMULQDQ macros (doesn't require AVX-512) */
57
#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
58
0
#  define z256_xor3_epi64(a, b, c)    _mm256_xor_si256(_mm256_xor_si256(a, b), c)
59
#endif
60
61
#ifndef z128_xor3_epi64
62
37.0M
#  define z128_xor3_epi64(a, b, c)    _mm_xor_si128(_mm_xor_si128(a, b), c)
63
#endif
64
65
static inline void fold_state_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
66
1.34M
                                const __m128i xmm_fold4) {
67
1.34M
    __m128i x_low  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
68
1.34M
    __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
69
70
1.34M
    *xmm_crc0 = *xmm_crc1;
71
1.34M
    *xmm_crc1 = *xmm_crc2;
72
1.34M
    *xmm_crc2 = *xmm_crc3;
73
1.34M
    *xmm_crc3 = _mm_xor_si128(x_low, x_high);
74
1.34M
}
crc32_pclmulqdq.c:fold_state_1
Line
Count
Source
66
1.34M
                                const __m128i xmm_fold4) {
67
1.34M
    __m128i x_low  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
68
1.34M
    __m128i x_high = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
69
70
1.34M
    *xmm_crc0 = *xmm_crc1;
71
1.34M
    *xmm_crc1 = *xmm_crc2;
72
1.34M
    *xmm_crc2 = *xmm_crc3;
73
1.34M
    *xmm_crc3 = _mm_xor_si128(x_low, x_high);
74
1.34M
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_1
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_1
75
76
static inline void fold_state_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
77
194k
                                const __m128i xmm_fold4) {
78
194k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
79
194k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
80
194k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
81
194k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
82
83
194k
    *xmm_crc0 = *xmm_crc2;
84
194k
    *xmm_crc1 = *xmm_crc3;
85
194k
    *xmm_crc2 = _mm_xor_si128(x_low0, x_high0);
86
194k
    *xmm_crc3 = _mm_xor_si128(x_low1, x_high1);
87
194k
}
crc32_pclmulqdq.c:fold_state_2
Line
Count
Source
77
194k
                                const __m128i xmm_fold4) {
78
194k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
79
194k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
80
194k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
81
194k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
82
83
194k
    *xmm_crc0 = *xmm_crc2;
84
194k
    *xmm_crc1 = *xmm_crc3;
85
194k
    *xmm_crc2 = _mm_xor_si128(x_low0, x_high0);
86
194k
    *xmm_crc3 = _mm_xor_si128(x_low1, x_high1);
87
194k
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_2
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_2
88
89
static inline void fold_state_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
90
96.1k
                                const __m128i xmm_fold4) {
91
96.1k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
92
96.1k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
93
96.1k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
94
96.1k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
95
96.1k
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
96
96.1k
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
97
98
96.1k
    *xmm_crc0 = *xmm_crc3;
99
96.1k
    *xmm_crc1 = _mm_xor_si128(x_low0, x_high0);
100
96.1k
    *xmm_crc2 = _mm_xor_si128(x_low1, x_high1);
101
96.1k
    *xmm_crc3 = _mm_xor_si128(x_low2, x_high2);
102
96.1k
}
crc32_pclmulqdq.c:fold_state_3
Line
Count
Source
90
96.1k
                                const __m128i xmm_fold4) {
91
96.1k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
92
96.1k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
93
96.1k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
94
96.1k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
95
96.1k
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
96
96.1k
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
97
98
96.1k
    *xmm_crc0 = *xmm_crc3;
99
96.1k
    *xmm_crc1 = _mm_xor_si128(x_low0, x_high0);
100
96.1k
    *xmm_crc2 = _mm_xor_si128(x_low1, x_high1);
101
96.1k
    *xmm_crc3 = _mm_xor_si128(x_low2, x_high2);
102
96.1k
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_3
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_3
103
104
static inline void fold_state_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
105
14.8M
                                const __m128i xmm_fold4) {
106
14.8M
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
107
14.8M
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
108
14.8M
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
109
14.8M
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
110
14.8M
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
111
14.8M
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
112
14.8M
    __m128i x_low3  = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
113
14.8M
    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
114
115
14.8M
    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
116
14.8M
    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
117
14.8M
    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
118
14.8M
    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
119
14.8M
}
crc32_pclmulqdq.c:fold_state_4
Line
Count
Source
105
14.8M
                                const __m128i xmm_fold4) {
106
14.8M
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
107
14.8M
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
108
14.8M
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
109
14.8M
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
110
14.8M
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
111
14.8M
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
112
14.8M
    __m128i x_low3  = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
113
14.8M
    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
114
115
14.8M
    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
116
14.8M
    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
117
14.8M
    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
118
14.8M
    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
119
14.8M
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_4
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_4
120
121
967k
static inline void fold_state_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
122
967k
    const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
123
967k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
124
967k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
125
967k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
126
967k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
127
967k
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
128
967k
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
129
967k
    __m128i x_low3  = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
130
967k
    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);
131
132
967k
    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
133
967k
    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
134
967k
    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
135
967k
    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
136
967k
}
crc32_pclmulqdq.c:fold_state_12
Line
Count
Source
121
967k
static inline void fold_state_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
122
967k
    const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);
123
967k
    __m128i x_low0  = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
124
967k
    __m128i x_high0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
125
967k
    __m128i x_low1  = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
126
967k
    __m128i x_high1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
127
967k
    __m128i x_low2  = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
128
967k
    __m128i x_high2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
129
967k
    __m128i x_low3  = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
130
967k
    __m128i x_high3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);
131
132
967k
    *xmm_crc0 = _mm_xor_si128(x_low0, x_high0);
133
967k
    *xmm_crc1 = _mm_xor_si128(x_low1, x_high1);
134
967k
    *xmm_crc2 = _mm_xor_si128(x_low2, x_high2);
135
967k
    *xmm_crc3 = _mm_xor_si128(x_low3, x_high3);
136
967k
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_12
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_12
137
138
/* 512-bit fold function requires AVX-512F */
139
#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
140
static inline void fold_state_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3,
141
                                 const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2,
142
0
                                 const __m512i zmm_t3, const __m512i zmm_fold16) {
143
0
    __m512i z_low0  = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01);
144
0
    __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10);
145
0
    __m512i z_low1  = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01);
146
0
    __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10);
147
0
    __m512i z_low2  = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01);
148
0
    __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10);
149
0
    __m512i z_low3  = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01);
150
0
    __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10);
151
152
0
    *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0);
153
0
    *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1);
154
0
    *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2);
155
0
    *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3);
156
0
}
157
158
static inline void fold_block_16(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0,
159
0
                                 __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const int COPY) {
160
0
    *len -= 256;
161
162
0
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
163
0
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
164
0
    __m512i z_low0, z_high0;
165
0
    const __m512i zmm_fold4 = _mm512_set4_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
166
0
    const __m512i zmm_fold16 = _mm512_set4_epi32(0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
167
168
0
    zmm_crc0 = _mm512_loadu_si512((__m512i *)*src);
169
0
    zmm_crc1 = _mm512_loadu_si512((__m512i *)*src + 1);
170
0
    zmm_crc2 = _mm512_loadu_si512((__m512i *)*src + 2);
171
0
    zmm_crc3 = _mm512_loadu_si512((__m512i *)*src + 3);
172
0
    *src += 256;
173
0
    if (COPY) {
174
0
        _mm512_storeu_si512((__m512i *)*dst, zmm_crc0);
175
0
        _mm512_storeu_si512((__m512i *)*dst + 1, zmm_crc1);
176
0
        _mm512_storeu_si512((__m512i *)*dst + 2, zmm_crc2);
177
0
        _mm512_storeu_si512((__m512i *)*dst + 3, zmm_crc3);
178
0
        *dst += 256;
179
0
    }
180
181
    // Fold existing xmm state into first 64 bytes
182
0
    zmm_t0 = _mm512_castsi128_si512(*xmm_crc0);
183
0
    zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc1, 1);
184
0
    zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc2, 2);
185
0
    zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc3, 3);
186
187
0
    z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01);
188
0
    z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10);
189
0
    zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0);
190
191
0
    while (*len >= 256) {
192
0
        *len -= 256;
193
0
        zmm_t0 = _mm512_loadu_si512((__m512i *)*src);
194
0
        zmm_t1 = _mm512_loadu_si512((__m512i *)*src + 1);
195
0
        zmm_t2 = _mm512_loadu_si512((__m512i *)*src + 2);
196
0
        zmm_t3 = _mm512_loadu_si512((__m512i *)*src + 3);
197
0
        *src += 256;
198
199
0
        fold_state_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16);
200
0
        if (COPY) {
201
0
            _mm512_storeu_si512((__m512i *)*dst, zmm_t0);
202
0
            _mm512_storeu_si512((__m512i *)*dst + 1, zmm_t1);
203
0
            _mm512_storeu_si512((__m512i *)*dst + 2, zmm_t2);
204
0
            _mm512_storeu_si512((__m512i *)*dst + 3, zmm_t3);
205
0
            *dst += 256;
206
0
        }
207
0
    }
208
209
    // zmm_crc[0,1,2,3] -> zmm_crc0
210
0
    z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
211
0
    z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
212
0
    zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1);
213
214
0
    z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
215
0
    z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
216
0
    zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2);
217
218
0
    z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
219
0
    z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
220
0
    zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3);
221
222
    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
223
0
    *xmm_crc0 = z512_extracti64x2(zmm_crc0, 0);
224
0
    *xmm_crc1 = z512_extracti64x2(zmm_crc0, 1);
225
0
    *xmm_crc2 = z512_extracti64x2(zmm_crc0, 2);
226
0
    *xmm_crc3 = z512_extracti64x2(zmm_crc0, 3);
227
0
}
228
#endif
229
/* 256-bit fold function for VPCLMULQDQ without AVX-512 */
230
#if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__)
231
static inline void fold_state_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3,
232
                                const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2,
233
0
                                const __m256i ymm_t3, const __m256i ymm_fold8) {
234
0
    __m256i y_low0  = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01);
235
0
    __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10);
236
0
    __m256i y_low1  = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01);
237
0
    __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10);
238
0
    __m256i y_low2  = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01);
239
0
    __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10);
240
0
    __m256i y_low3  = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01);
241
0
    __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10);
242
243
0
    *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0);
244
0
    *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1);
245
0
    *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2);
246
0
    *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3);
247
0
}
248
249
static inline void fold_block_8(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0,
250
                                __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
251
0
                                const __m128i xmm_fold4, const int COPY) {
252
0
    *len -= 128;
253
254
0
    __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3;
255
0
    __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3;
256
0
    __m256i y_low0, y_high0;
257
0
    const __m256i ymm_fold4 = _mm256_set_epi32(
258
0
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596,
259
0
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
260
0
    const __m256i ymm_fold8 = _mm256_set_epi32(
261
0
        0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880,
262
0
        0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880);
263
264
0
    ymm_crc0 = _mm256_loadu_si256((__m256i *)*src);
265
0
    ymm_crc1 = _mm256_loadu_si256((__m256i *)*src + 1);
266
0
    ymm_crc2 = _mm256_loadu_si256((__m256i *)*src + 2);
267
0
    ymm_crc3 = _mm256_loadu_si256((__m256i *)*src + 3);
268
0
    *src += 128;
269
0
    if (COPY) {
270
0
        _mm256_storeu_si256((__m256i *)*dst, ymm_crc0);
271
0
        _mm256_storeu_si256((__m256i *)*dst + 1, ymm_crc1);
272
0
        _mm256_storeu_si256((__m256i *)*dst + 2, ymm_crc2);
273
0
        _mm256_storeu_si256((__m256i *)*dst + 3, ymm_crc3);
274
0
        *dst += 128;
275
0
    }
276
277
    // Fold existing xmm state into first 32 bytes
278
0
    ymm_t0 = _mm256_castsi128_si256(*xmm_crc0);
279
0
    ymm_t0 = _mm256_inserti128_si256(ymm_t0, *xmm_crc1, 1);
280
281
0
    y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
282
0
    y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
283
0
    ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0);
284
285
0
    ymm_t0 = _mm256_castsi128_si256(*xmm_crc2);
286
0
    ymm_t0 = _mm256_inserti128_si256(ymm_t0, *xmm_crc3, 1);
287
288
0
    y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01);
289
0
    y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10);
290
0
    ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0);
291
292
0
    while (*len >= 128) {
293
0
        *len -= 128;
294
0
        ymm_t0 = _mm256_loadu_si256((__m256i *)*src);
295
0
        ymm_t1 = _mm256_loadu_si256((__m256i *)*src + 1);
296
0
        ymm_t2 = _mm256_loadu_si256((__m256i *)*src + 2);
297
0
        ymm_t3 = _mm256_loadu_si256((__m256i *)*src + 3);
298
0
        *src += 128;
299
300
0
        fold_state_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8);
301
0
        if (COPY) {
302
0
            _mm256_storeu_si256((__m256i *)*dst, ymm_t0);
303
0
            _mm256_storeu_si256((__m256i *)*dst + 1, ymm_t1);
304
0
            _mm256_storeu_si256((__m256i *)*dst + 2, ymm_t2);
305
0
            _mm256_storeu_si256((__m256i *)*dst + 3, ymm_t3);
306
0
            *dst += 128;
307
0
        }
308
0
    }
309
310
    // Extract 8 x 128-bit lanes from 4 x 256-bit registers
311
0
    __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0);
312
0
    __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1);
313
0
    __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1);
314
0
    __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1);
315
0
    __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2);
316
0
    __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1);
317
0
    __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3);
318
0
    __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1);
319
320
    // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4)
321
0
    __m128i x_low, x_high;
322
0
    x_low  = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01);
323
0
    x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10);
324
0
    *xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4);
325
326
0
    x_low  = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01);
327
0
    x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10);
328
0
    *xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5);
329
330
0
    x_low  = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01);
331
0
    x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10);
332
0
    *xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6);
333
334
0
    x_low  = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01);
335
0
    x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10);
336
0
    *xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7);
337
0
}
338
#endif
339
340
/* Chorba folding algorithm implemented from https://arxiv.org/abs/2412.16398
341
 * We interleave the PCLMUL-based folds with 8x scaled generator polynomial copies; we read
342
 * 8x QWORDS and then XOR them into the stream at the following offsets: 6, 9, 10, 16, 20, 22,
343
 * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper as "generator_64_bits_unrolled_8" */
344
#if !defined(X86_VPCLMULQDQ)
345
static inline void fold_block_chorba(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0,
346
                                     __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
347
23.6k
                                     const __m128i xmm_fold4, const int COPY) {
348
23.6k
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
349
350
23.6k
#ifndef __AVX512VL__
351
23.6k
    if (!COPY)
352
5.69k
        return;
353
17.9k
#endif
354
985k
    while (*len >= 512 + 64 + (16 * 8)) {
355
967k
        __m128i chorba8 = _mm_load_si128((__m128i *)*src);
356
967k
        __m128i chorba7 = _mm_load_si128((__m128i *)*src + 1);
357
967k
        __m128i chorba6 = _mm_load_si128((__m128i *)*src + 2);
358
967k
        __m128i chorba5 = _mm_load_si128((__m128i *)*src + 3);
359
967k
        __m128i chorba4 = _mm_load_si128((__m128i *)*src + 4);
360
967k
        __m128i chorba3 = _mm_load_si128((__m128i *)*src + 5);
361
967k
        __m128i chorba2 = _mm_load_si128((__m128i *)*src + 6);
362
967k
        __m128i chorba1 = _mm_load_si128((__m128i *)*src + 7);
363
967k
        if (COPY) {
364
967k
            _mm_storeu_si128((__m128i *)*dst, chorba8);
365
967k
            _mm_storeu_si128((__m128i *)*dst + 1, chorba7);
366
967k
            _mm_storeu_si128((__m128i *)*dst + 2, chorba6);
367
967k
            _mm_storeu_si128((__m128i *)*dst + 3, chorba5);
368
967k
            _mm_storeu_si128((__m128i *)*dst + 4, chorba4);
369
967k
            _mm_storeu_si128((__m128i *)*dst + 5, chorba3);
370
967k
            _mm_storeu_si128((__m128i *)*dst + 6, chorba2);
371
967k
            _mm_storeu_si128((__m128i *)*dst + 7, chorba1);
372
967k
            *dst += 16 * 8;
373
967k
        }
374
375
967k
        chorba2 = _mm_xor_si128(chorba2, chorba8);
376
967k
        chorba1 = _mm_xor_si128(chorba1, chorba7);
377
967k
        *src += 16 * 8;
378
967k
        *len -= 16 * 8;
379
380
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src);
381
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 1);
382
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 2);
383
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 3);
384
385
967k
        fold_state_12(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
386
967k
        if (COPY) {
387
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
388
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
389
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
390
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
391
967k
            *dst += 64;
392
967k
        }
393
394
967k
        *xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, *xmm_crc0);
395
967k
        *xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), *xmm_crc1);
396
967k
        *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, *xmm_crc2);
397
967k
        *xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, *xmm_crc3);
398
399
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 4);
400
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 5);
401
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 6);
402
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 7);
403
404
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
405
967k
        if (COPY) {
406
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
407
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
408
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
409
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
410
967k
            *dst += 64;
411
967k
        }
412
413
967k
        *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, *xmm_crc0);
414
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, *xmm_crc1);
415
967k
        *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), *xmm_crc2);
416
967k
        *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), *xmm_crc3);
417
418
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 8);
419
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 9);
420
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 10);
421
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 11);
422
423
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
424
967k
        if (COPY) {
425
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
426
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
427
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
428
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
429
967k
            *dst += 64;
430
967k
        }
431
432
967k
        *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, *xmm_crc0);
433
967k
        *xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), *xmm_crc1);
434
967k
        *xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, *xmm_crc2);
435
967k
        *xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, *xmm_crc3);
436
437
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 12);
438
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 13);
439
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 14);
440
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 15);
441
442
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
443
967k
        if (COPY) {
444
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
445
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
446
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
447
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
448
967k
            *dst += 64;
449
967k
        }
450
451
967k
        *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), *xmm_crc0);
452
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, *xmm_crc1);
453
967k
        *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), *xmm_crc2);
454
967k
        *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), *xmm_crc3);
455
456
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 16);
457
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 17);
458
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 18);
459
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 19);
460
461
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
462
967k
        if (COPY) {
463
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
464
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
465
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
466
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
467
967k
            *dst += 64;
468
967k
        }
469
470
967k
        *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), *xmm_crc0);
471
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, *xmm_crc1);
472
967k
        *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, *xmm_crc2);
473
967k
        *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), *xmm_crc3);
474
475
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 20);
476
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 21);
477
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 22);
478
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 23);
479
480
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
481
967k
        if (COPY) {
482
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
483
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
484
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
485
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
486
967k
            *dst += 64;
487
967k
        }
488
489
967k
        *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), *xmm_crc0);
490
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, *xmm_crc1);
491
967k
        *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, *xmm_crc2);
492
967k
        *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), *xmm_crc3);
493
494
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 24);
495
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 25);
496
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 26);
497
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 27);
498
499
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
500
967k
        if (COPY) {
501
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
502
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
503
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
504
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
505
967k
            *dst += 64;
506
967k
        }
507
508
967k
        *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), *xmm_crc0);
509
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, *xmm_crc1);
510
967k
        *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, *xmm_crc2);
511
967k
        *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), *xmm_crc3);
512
513
967k
        xmm_t0 = _mm_load_si128((__m128i *)*src + 28);
514
967k
        xmm_t1 = _mm_load_si128((__m128i *)*src + 29);
515
967k
        xmm_t2 = _mm_load_si128((__m128i *)*src + 30);
516
967k
        xmm_t3 = _mm_load_si128((__m128i *)*src + 31);
517
518
967k
        fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4);
519
967k
        if (COPY) {
520
967k
            _mm_storeu_si128((__m128i *)*dst, xmm_t0);
521
967k
            _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1);
522
967k
            _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2);
523
967k
            _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3);
524
967k
            *dst += 64;
525
967k
        }
526
527
967k
        *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, *xmm_crc0);
528
967k
        *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, *xmm_crc1);
529
967k
        *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), *xmm_crc2);
530
967k
        *xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, *xmm_crc3);
531
532
967k
        *len -= 512;
533
967k
        *src += 512;
534
967k
    }
535
17.9k
}
536
#endif
537
538
Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len,
539
7.06M
                                              const int COPY) {
540
7.06M
    size_t copy_len = len;
541
7.06M
    if (len >= 16) {
542
        /* Calculate 16-byte alignment offset */
543
1.37M
        uintptr_t align_diff = ALIGN_DIFF(src, 16);
544
545
        /* If total length is less than (alignment bytes + 16), use the faster small method.
546
         * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
547
1.37M
        copy_len = len < align_diff + 16 ? len : align_diff;
548
1.37M
    }
549
550
7.06M
    if (copy_len > 0) {
551
6.60M
        crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY);
552
6.60M
        src += copy_len;
553
6.60M
        len -= copy_len;
554
6.60M
        if (COPY) {
555
4.17k
            dst += copy_len;
556
4.17k
        }
557
6.60M
    }
558
559
7.06M
    if (len == 0)
560
5.72M
        return crc;
561
562
1.34M
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
563
564
1.34M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
565
1.34M
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
566
1.34M
    __m128i xmm_crc1 = _mm_setzero_si128();
567
1.34M
    __m128i xmm_crc2 = _mm_setzero_si128();
568
1.34M
    __m128i xmm_crc3 = _mm_setzero_si128();
569
570
1.34M
    if (crc != 0) {
571
        // Process the first 16 bytes and handle initial CRC
572
1.11M
        len -= 16;
573
1.11M
        xmm_t0 = _mm_load_si128((__m128i *)src);
574
1.11M
        src += 16;
575
576
1.11M
        fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
577
1.11M
        if (COPY) {
578
15.2k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
579
15.2k
            dst += 16;
580
15.2k
        }
581
1.11M
        xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
582
1.11M
    }
583
584
#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
585
    /* 512-bit VPCLMULQDQ path requires AVX-512F */
586
0
    if (len >= 256)
587
0
        fold_block_16(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, COPY);
588
#elif defined(X86_VPCLMULQDQ)
589
    /* 256-bit VPCLMULQDQ path (doesn't require AVX-512F) */
590
0
    if (len >= 128)
591
0
        fold_block_8(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY);
592
#else
593
    /* Chorba algorithm for PCLMULQDQ path (when VPCLMULQDQ not available) */
594
1.34M
    if (len >= 512 + 64 + (16 * 8))
595
23.6k
        fold_block_chorba(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY);
596
#endif  /* X86_VPCLMULQDQ */
597
598
9.45M
    while (len >= 64) {
599
8.11M
        len -= 64;
600
8.11M
        xmm_t0 = _mm_load_si128((__m128i *)src);
601
8.11M
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
602
8.11M
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
603
8.11M
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
604
8.11M
        src += 64;
605
606
8.11M
        fold_state_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
607
8.11M
        if (COPY) {
608
49.8k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
609
49.8k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
610
49.8k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
611
49.8k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
612
49.8k
            dst += 64;
613
49.8k
        }
614
615
8.11M
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
616
8.11M
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
617
8.11M
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
618
8.11M
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
619
8.11M
    }
620
621
    /*
622
     * len = num bytes left (always < 64 after the 64-byte loop above)
623
     */
624
1.34M
    if (len >= 48) {
625
96.1k
        len -= 48;
626
627
96.1k
        xmm_t0 = _mm_load_si128((__m128i *)src);
628
96.1k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
629
96.1k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
630
96.1k
        src += 48;
631
632
96.1k
        fold_state_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
633
96.1k
        if (COPY) {
634
13.2k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
635
13.2k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
636
13.2k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
637
13.2k
            dst += 48;
638
13.2k
        }
639
640
96.1k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
641
96.1k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
642
96.1k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
643
1.24M
    } else if (len >= 32) {
644
194k
        len -= 32;
645
646
194k
        xmm_t0 = _mm_load_si128((__m128i *)src);
647
194k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
648
194k
        src += 32;
649
650
194k
        fold_state_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
651
194k
        if (COPY) {
652
4.49k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
653
4.49k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
654
4.49k
            dst += 32;
655
4.49k
        }
656
657
194k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
658
194k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
659
1.05M
    } else if (len >= 16) {
660
231k
        len -= 16;
661
231k
        xmm_t0 = _mm_load_si128((__m128i *)src);
662
231k
        src += 16;
663
664
231k
        fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
665
231k
        if (COPY) {
666
3.09k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
667
3.09k
            dst += 16;
668
3.09k
        }
669
670
231k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
671
231k
    }
672
673
1.34M
    const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
674
1.34M
    const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641);
675
676
    /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */
677
1.34M
    __m128i x_low0  = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01);
678
1.34M
    __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10);
679
1.34M
    xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0);
680
681
1.34M
    __m128i x_low1  = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01);
682
1.34M
    __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10);
683
1.34M
    xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1);
684
685
1.34M
    __m128i x_low2  = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01);
686
1.34M
    __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10);
687
1.34M
    xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2);
688
689
    /* Fold remaining bytes into the 128-bit state */
690
1.34M
    if (len) {
691
904k
        const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
692
904k
        const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
693
694
        /* Create masks to shift bytes for partial input */
695
904k
        __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16));
696
904k
        __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
697
698
        /* Shift out bytes from crc3 to make space for new data */
699
904k
        __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl);
700
904k
        xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr);
701
702
        /* Insert the partial input into crc3 */
703
#if defined(__AVX512BW__) && defined(__AVX512VL__)
704
        __mmask16 k = (1 << len) - 1;
705
        __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src);
706
0
        if (COPY) {
707
0
            _mm_mask_storeu_epi8(dst, k, xmm_crc_part);
708
0
        }
709
#else
710
        __m128i xmm_crc_part = _mm_setzero_si128();
711
        memcpy(&xmm_crc_part, src, len);
712
904k
        if (COPY) {
713
11.6k
            memcpy(dst, src, len);
714
11.6k
        }
715
#endif
716
904k
        __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
717
904k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned);
718
719
        /* Fold the bytes that were shifted out back into crc3 */
720
904k
        __m128i ovf_low  = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
721
904k
        __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
722
904k
        xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
723
904k
    }
724
725
    /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
726
1.34M
    __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
727
1.34M
    __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);
728
729
1.34M
    x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
730
1.34M
    x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);
731
732
1.34M
    __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
733
1.34M
    __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);
734
735
1.34M
    crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));
736
737
1.34M
    return ~crc;
738
7.06M
}
crc32_pclmulqdq.c:crc32_copy_impl
Line
Count
Source
539
7.06M
                                              const int COPY) {
540
7.06M
    size_t copy_len = len;
541
7.06M
    if (len >= 16) {
542
        /* Calculate 16-byte alignment offset */
543
1.37M
        uintptr_t align_diff = ALIGN_DIFF(src, 16);
544
545
        /* If total length is less than (alignment bytes + 16), use the faster small method.
546
         * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
547
1.37M
        copy_len = len < align_diff + 16 ? len : align_diff;
548
1.37M
    }
549
550
7.06M
    if (copy_len > 0) {
551
6.60M
        crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY);
552
6.60M
        src += copy_len;
553
6.60M
        len -= copy_len;
554
6.60M
        if (COPY) {
555
4.17k
            dst += copy_len;
556
4.17k
        }
557
6.60M
    }
558
559
7.06M
    if (len == 0)
560
5.72M
        return crc;
561
562
1.34M
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
563
564
1.34M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
565
1.34M
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
566
1.34M
    __m128i xmm_crc1 = _mm_setzero_si128();
567
1.34M
    __m128i xmm_crc2 = _mm_setzero_si128();
568
1.34M
    __m128i xmm_crc3 = _mm_setzero_si128();
569
570
1.34M
    if (crc != 0) {
571
        // Process the first 16 bytes and handle initial CRC
572
1.11M
        len -= 16;
573
1.11M
        xmm_t0 = _mm_load_si128((__m128i *)src);
574
1.11M
        src += 16;
575
576
1.11M
        fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
577
1.11M
        if (COPY) {
578
15.2k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
579
15.2k
            dst += 16;
580
15.2k
        }
581
1.11M
        xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc));
582
1.11M
    }
583
584
#if defined(X86_VPCLMULQDQ) && defined(__AVX512F__)
585
    /* 512-bit VPCLMULQDQ path requires AVX-512F */
586
    if (len >= 256)
587
        fold_block_16(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, COPY);
588
#elif defined(X86_VPCLMULQDQ)
589
    /* 256-bit VPCLMULQDQ path (doesn't require AVX-512F) */
590
    if (len >= 128)
591
        fold_block_8(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY);
592
#else
593
    /* Chorba algorithm for PCLMULQDQ path (when VPCLMULQDQ not available) */
594
1.34M
    if (len >= 512 + 64 + (16 * 8))
595
23.6k
        fold_block_chorba(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY);
596
1.34M
#endif  /* X86_VPCLMULQDQ */
597
598
9.45M
    while (len >= 64) {
599
8.11M
        len -= 64;
600
8.11M
        xmm_t0 = _mm_load_si128((__m128i *)src);
601
8.11M
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
602
8.11M
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
603
8.11M
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
604
8.11M
        src += 64;
605
606
8.11M
        fold_state_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
607
8.11M
        if (COPY) {
608
49.8k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
609
49.8k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
610
49.8k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
611
49.8k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
612
49.8k
            dst += 64;
613
49.8k
        }
614
615
8.11M
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
616
8.11M
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
617
8.11M
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
618
8.11M
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
619
8.11M
    }
620
621
    /*
622
     * len = num bytes left (always < 64 after the 64-byte loop above)
623
     */
624
1.34M
    if (len >= 48) {
625
96.1k
        len -= 48;
626
627
96.1k
        xmm_t0 = _mm_load_si128((__m128i *)src);
628
96.1k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
629
96.1k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
630
96.1k
        src += 48;
631
632
96.1k
        fold_state_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
633
96.1k
        if (COPY) {
634
13.2k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
635
13.2k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
636
13.2k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
637
13.2k
            dst += 48;
638
13.2k
        }
639
640
96.1k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
641
96.1k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
642
96.1k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
643
1.24M
    } else if (len >= 32) {
644
194k
        len -= 32;
645
646
194k
        xmm_t0 = _mm_load_si128((__m128i *)src);
647
194k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
648
194k
        src += 32;
649
650
194k
        fold_state_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
651
194k
        if (COPY) {
652
4.49k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
653
4.49k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
654
4.49k
            dst += 32;
655
4.49k
        }
656
657
194k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
658
194k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
659
1.05M
    } else if (len >= 16) {
660
231k
        len -= 16;
661
231k
        xmm_t0 = _mm_load_si128((__m128i *)src);
662
231k
        src += 16;
663
664
231k
        fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4);
665
231k
        if (COPY) {
666
3.09k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
667
3.09k
            dst += 16;
668
3.09k
        }
669
670
231k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
671
231k
    }
672
673
1.34M
    const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e);
674
1.34M
    const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641);
675
676
    /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */
677
1.34M
    __m128i x_low0  = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01);
678
1.34M
    __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10);
679
1.34M
    xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0);
680
681
1.34M
    __m128i x_low1  = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01);
682
1.34M
    __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10);
683
1.34M
    xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1);
684
685
1.34M
    __m128i x_low2  = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01);
686
1.34M
    __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10);
687
1.34M
    xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2);
688
689
    /* Fold remaining bytes into the 128-bit state */
690
1.34M
    if (len) {
691
904k
        const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
692
904k
        const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
693
694
        /* Create masks to shift bytes for partial input */
695
904k
        __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16));
696
904k
        __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
697
698
        /* Shift out bytes from crc3 to make space for new data */
699
904k
        __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl);
700
904k
        xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr);
701
702
        /* Insert the partial input into crc3 */
703
#if defined(__AVX512BW__) && defined(__AVX512VL__)
704
        __mmask16 k = (1 << len) - 1;
705
        __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src);
706
        if (COPY) {
707
            _mm_mask_storeu_epi8(dst, k, xmm_crc_part);
708
        }
709
#else
710
904k
        __m128i xmm_crc_part = _mm_setzero_si128();
711
904k
        memcpy(&xmm_crc_part, src, len);
712
904k
        if (COPY) {
713
11.6k
            memcpy(dst, src, len);
714
11.6k
        }
715
904k
#endif
716
904k
        __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
717
904k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned);
718
719
        /* Fold the bytes that were shifted out back into crc3 */
720
904k
        __m128i ovf_low  = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01);
721
904k
        __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10);
722
904k
        xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high);
723
904k
    }
724
725
    /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */
726
1.34M
    __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00);
727
1.34M
    __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10);
728
729
1.34M
    x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf);
730
1.34M
    x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3);
731
732
1.34M
    __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01);
733
1.34M
    __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10);
734
735
1.34M
    crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2));
736
737
1.34M
    return ~crc;
738
7.06M
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:crc32_copy_impl
Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:crc32_copy_impl