Coverage Report

Created: 2025-08-29 06:28

/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
Line
Count
Source
1
/*
2
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
3
 * instruction.
4
 *
5
 * A white paper describing this algorithm can be found at:
6
 *     doc/crc-pclmulqdq.pdf
7
 *
8
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
9
 * Copyright (C) 2016 Marian Beermann (support for initial value)
10
 * Authors:
11
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
12
 *     Jim Guilford    <james.guilford@intel.com>
13
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
14
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
15
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
16
 *
17
 * For conditions of distribution and use, see copyright notice in zlib.h
18
 */
19
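The header above describes the folding approach: a 512-bit running remainder is kept in four 128-bit registers (xmm_crc0..xmm_crc3) and advanced lane by lane with carry-less multiplication. A minimal sketch of what one such lane step looks like is shown below, assuming the usual fold-by-4 layout; the helper name fold_lane and the way the two folding constants are packed into k are illustrative, not this file's actual fold_1/fold_4 helpers.

#include <immintrin.h>   /* SSE2 + PCLMULQDQ intrinsics; build with -mpclmul */

/* One 128-bit fold step: multiply the two 64-bit halves of the running
 * remainder x by two precomputed constants (powers of x modulo the CRC
 * polynomial, chosen for the fold distance) packed in k, then XOR in the
 * next 128 bits of input. Sketch only. */
static inline __m128i fold_lane(__m128i x, __m128i next_data, __m128i k) {
    __m128i hi = _mm_clmulepi64_si128(x, k, 0x01);  /* high half of x times low half of k  */
    __m128i lo = _mm_clmulepi64_si128(x, k, 0x10);  /* low half of x  times high half of k */
    return _mm_xor_si128(_mm_xor_si128(hi, lo), next_data);
}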
20
#ifdef COPY
21
16.8k
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
22
#else
23
1.43M
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
1.43M
#endif
25
1.43M
    unsigned long algn_diff;
26
1.43M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
1.43M
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
1.43M
    __m128i xmm_crc_part = _mm_setzero_si128();
29
1.43M
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
#ifndef COPY
31
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
    assert(len >= 16 || first == 0);
39
#endif
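When an initial CRC is supplied (non-COPY variant), it is widened to 128 bits with _mm_cvtsi32_si128 and must be XORed into the very first block of data entering the fold pipeline, exactly once; that is what the XOR_INITIAL128() calls below and the `first` flag accomplish. A hedged sketch of what that injection amounts to (the real macro is defined elsewhere in zlib-ng and may differ in detail):

#include <immintrin.h>
#include <stdint.h>

/* Fold the 32-bit starting CRC into the low lane of the first data word,
 * one time only. Assumed shape of the XOR_INITIAL128 step; names follow
 * the listing above. */
static inline __m128i xor_initial128_once(__m128i first_word, uint32_t init_crc, int32_t *first) {
    if (*first) {
        first_word = _mm_xor_si128(first_word, _mm_cvtsi32_si128((int32_t)init_crc));
        *first = 0;
    }
    return first_word;
}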
40
1.43M
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
1.45M
    if (len < 16) {
43
1.75k
        if (len == 0)
44
0
            return;
45
46
1.75k
        memcpy(partial_buf, src, len);
47
1.75k
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
#ifdef COPY
49
        memcpy(dst, partial_buf, len);
50
#endif
51
1.75k
        goto partial;
52
1.75k
    }
53
54
1.45M
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
1.45M
    if (algn_diff) {
56
1.17M
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
#ifdef COPY
58
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
        dst += algn_diff;
60
#else
61
1.17M
        XOR_INITIAL128(xmm_crc_part);
62
63
1.17M
        if (algn_diff < 4 && init_crc != 0) {
64
104k
            xmm_t0 = xmm_crc_part;
65
104k
            if (len >= 32) {
66
39.6k
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
39.6k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
39.6k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
64.4k
            } else {
70
64.4k
                memcpy(partial_buf, src + 16, len - 16);
71
64.4k
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
64.4k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
64.4k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
64.4k
                src += 16;
75
64.4k
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
64.4k
                goto partial;
80
64.4k
            }
81
82
39.6k
            src += 16;
83
39.6k
            len -= 16;
84
39.6k
        }
85
1.10M
#endif
86
87
1.10M
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
1.10M
        src += algn_diff;
90
1.10M
        len -= algn_diff;
91
1.10M
    }
92
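The alignment step above, algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF, yields the number of bytes that must be folded in before src reaches a 16-byte boundary (0 when src is already aligned), so the bulk loops that follow can use aligned 128-bit loads. A small standalone check of that arithmetic:

#include <assert.h>
#include <stdint.h>

/* Bytes remaining until the next 16-byte boundary; 0 if already aligned. */
static unsigned long align_diff16(uintptr_t addr) {
    return (unsigned long)(((uintptr_t)16 - (addr & 0xF)) & 0xF);
}

int main(void) {
    assert(align_diff16(0x1000) == 0);   /* already aligned               */
    assert(align_diff16(0x1001) == 15);  /* 15 bytes to the next boundary */
    assert(align_diff16(0x1009) == 7);
    assert(align_diff16(0x100F) == 1);
    return 0;
}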
93
#ifdef X86_VPCLMULQDQ
94
0
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
0
        len -= n;
104
0
        src += n;
105
0
    }
106
#endif
107
108
0
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDs and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
2.63M
        while (len >= 512 + 64 + 16*8) {
116
1.24M
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
1.24M
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
1.24M
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
1.24M
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
1.24M
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
1.24M
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
1.24M
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
1.24M
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
#ifdef COPY
125
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
            dst += 16*8;
134
#else
135
557k
            XOR_INITIAL128(chorba8);
136
#endif
137
1.24M
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
1.24M
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
1.24M
            src += 16*8;
140
1.24M
            len -= 16*8;
141
142
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
1.24M
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
#ifdef COPY
149
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
            dst += 64;
154
#endif
155
1.24M
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
#ifdef COPY
171
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
            dst += 64;
176
#endif
177
178
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
#ifdef COPY
194
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
            dst += 64;
199
#endif
200
201
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
1.24M
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
1.24M
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
#ifdef COPY
217
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
            dst += 64;
222
#endif
223
224
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
#ifdef COPY
240
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
            dst += 64;
245
#endif
246
247
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
#ifdef COPY
263
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
            dst += 64;
268
#endif
269
270
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
#ifdef COPY
286
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
            dst += 64;
291
#endif
292
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
1.24M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
1.24M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
1.24M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
1.24M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
1.24M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
1.24M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
#ifdef COPY
308
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
            dst += 64;
313
#endif
314
1.24M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
1.24M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
1.24M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
1.24M
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
1.24M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
1.24M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
1.24M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
1.24M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
1.24M
            len -= 512;
324
1.24M
            src += 512;
325
1.24M
        }
326
1.37M
#endif /* WITHOUT_CHORBA */
327
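The Chorba trick and the PCLMUL folds both lean on the GF(2) linearity of CRC-32: XOR in the data domain corresponds to XOR in the CRC domain, up to a constant, so perturbations to the stream can be computed piecewise and cancelled. A minimal standalone check of that identity using zlib's public crc32() (the buffers are arbitrary test data):

#include <assert.h>
#include <zlib.h>   /* crc32() */

/* For equal-length buffers: crc(a ^ b) == crc(a) ^ crc(b) ^ crc(zeros).
 * The init and final-XOR constants make CRC-32 affine rather than strictly
 * linear, hence the crc(zeros) correction term. */
int main(void) {
    unsigned char a[32], b[32], x[32], z[32] = {0};
    for (int i = 0; i < 32; i++) {
        a[i] = (unsigned char)(7 * i + 1);
        b[i] = (unsigned char)(13 * i + 5);
        x[i] = (unsigned char)(a[i] ^ b[i]);
    }
    uLong ca = crc32(0L, a, 32);
    uLong cb = crc32(0L, b, 32);
    uLong cx = crc32(0L, x, 32);
    uLong cz = crc32(0L, z, 32);
    assert(cx == (ca ^ cb ^ cz));
    return 0;
}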
328
1.95M
    while (len >= 64) {
329
561k
        len -= 64;
330
561k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
561k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
561k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
561k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
561k
        src += 64;
335
336
561k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
#ifdef COPY
338
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
        dst += 64;
343
#else
344
522k
        XOR_INITIAL128(xmm_t0);
345
#endif
346
347
561k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
561k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
561k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
561k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
561k
    }
352
353
    /*
354
     * Fewer than 64 bytes of input remain at this point.
355
     */
356
1.38M
    if (len >= 48) {
357
178k
        len -= 48;
358
359
178k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
178k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
178k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
178k
        src += 48;
363
#ifdef COPY
364
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
        dst += 48;
368
#else
369
176k
        XOR_INITIAL128(xmm_t0);
370
#endif
371
178k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
178k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
178k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
178k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
1.20M
    } else if (len >= 32) {
377
195k
        len -= 32;
378
379
195k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
195k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
195k
        src += 32;
382
#ifdef COPY
383
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
        dst += 32;
386
#else
387
194k
        XOR_INITIAL128(xmm_t0);
388
#endif
389
195k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
195k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
195k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
1.01M
    } else if (len >= 16) {
394
896k
        len -= 16;
395
896k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
896k
        src += 16;
397
#ifdef COPY
398
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
        dst += 16;
400
#else
401
894k
        XOR_INITIAL128(xmm_t0);
402
#endif
403
896k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
896k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
896k
    }
407
408
1.45M
partial:
409
1.45M
    if (len) {
410
1.18M
        memcpy(&xmm_crc_part, src, len);
411
#ifdef COPY
412
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
        memcpy(dst, partial_buf, len);
414
#endif
415
1.18M
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
1.18M
    }
417
418
1.45M
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
1.45M
}
crc32_fold_pclmulqdq_copy
Line
Count
Source
21
16.8k
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
22
#else
23
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
#endif
25
16.8k
    unsigned long algn_diff;
26
16.8k
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
16.8k
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
16.8k
    __m128i xmm_crc_part = _mm_setzero_si128();
29
16.8k
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
#ifndef COPY
31
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
    assert(len >= 16 || first == 0);
39
#endif
40
16.8k
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
16.8k
    if (len < 16) {
43
900
        if (len == 0)
44
0
            return;
45
46
900
        memcpy(partial_buf, src, len);
47
900
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
900
#ifdef COPY
49
900
        memcpy(dst, partial_buf, len);
50
900
#endif
51
900
        goto partial;
52
900
    }
53
54
15.9k
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
15.9k
    if (algn_diff) {
56
1.78k
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
1.78k
#ifdef COPY
58
1.78k
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
1.78k
        dst += algn_diff;
60
#else
61
        XOR_INITIAL128(xmm_crc_part);
62
63
        if (algn_diff < 4 && init_crc != 0) {
64
            xmm_t0 = xmm_crc_part;
65
            if (len >= 32) {
66
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
            } else {
70
                memcpy(partial_buf, src + 16, len - 16);
71
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
                src += 16;
75
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
                goto partial;
80
            }
81
82
            src += 16;
83
            len -= 16;
84
        }
85
#endif
86
87
1.78k
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
1.78k
        src += algn_diff;
90
1.78k
        len -= algn_diff;
91
1.78k
    }
92
93
#ifdef X86_VPCLMULQDQ
94
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
        len -= n;
104
        src += n;
105
    }
106
#endif
107
108
15.9k
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDs and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
700k
        while (len >= 512 + 64 + 16*8) {
116
684k
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
684k
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
684k
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
684k
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
684k
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
684k
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
684k
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
684k
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
684k
#ifdef COPY
125
684k
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
684k
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
684k
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
684k
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
684k
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
684k
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
684k
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
684k
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
684k
            dst += 16*8;
134
#else
135
            XOR_INITIAL128(chorba8);
136
#endif
137
684k
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
684k
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
684k
            src += 16*8;
140
684k
            len -= 16*8;
141
142
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
684k
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
684k
#ifdef COPY
149
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
684k
            dst += 64;
154
684k
#endif
155
684k
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
684k
#ifdef COPY
171
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
684k
            dst += 64;
176
684k
#endif
177
178
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
684k
#ifdef COPY
194
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
684k
            dst += 64;
199
684k
#endif
200
201
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
684k
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
684k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
684k
#ifdef COPY
217
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
684k
            dst += 64;
222
684k
#endif
223
224
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
684k
#ifdef COPY
240
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
684k
            dst += 64;
245
684k
#endif
246
247
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
684k
#ifdef COPY
263
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
684k
            dst += 64;
268
684k
#endif
269
270
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
684k
#ifdef COPY
286
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
684k
            dst += 64;
291
684k
#endif
292
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
684k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
684k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
684k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
684k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
684k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
684k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
684k
#ifdef COPY
308
684k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
684k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
684k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
684k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
684k
            dst += 64;
313
684k
#endif
314
684k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
684k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
684k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
684k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
684k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
684k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
684k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
684k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
684k
            len -= 512;
324
684k
            src += 512;
325
684k
        }
326
15.9k
#endif /* WITHOUT_CHORBA */
327
328
54.4k
    while (len >= 64) {
329
38.4k
        len -= 64;
330
38.4k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
38.4k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
38.4k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
38.4k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
38.4k
        src += 64;
335
336
38.4k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
38.4k
#ifdef COPY
338
38.4k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
38.4k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
38.4k
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
38.4k
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
38.4k
        dst += 64;
343
#else
344
        XOR_INITIAL128(xmm_t0);
345
#endif
346
347
38.4k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
38.4k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
38.4k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
38.4k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
38.4k
    }
352
353
    /*
354
     * Fewer than 64 bytes of input remain at this point.
355
     */
356
15.9k
    if (len >= 48) {
357
2.41k
        len -= 48;
358
359
2.41k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
2.41k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
2.41k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
2.41k
        src += 48;
363
2.41k
#ifdef COPY
364
2.41k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
2.41k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
2.41k
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
2.41k
        dst += 48;
368
#else
369
        XOR_INITIAL128(xmm_t0);
370
#endif
371
2.41k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
2.41k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
2.41k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
2.41k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
13.5k
    } else if (len >= 32) {
377
1.27k
        len -= 32;
378
379
1.27k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
1.27k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
1.27k
        src += 32;
382
1.27k
#ifdef COPY
383
1.27k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
1.27k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
1.27k
        dst += 32;
386
#else
387
        XOR_INITIAL128(xmm_t0);
388
#endif
389
1.27k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
1.27k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
1.27k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
12.2k
    } else if (len >= 16) {
394
1.81k
        len -= 16;
395
1.81k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
1.81k
        src += 16;
397
1.81k
#ifdef COPY
398
1.81k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
1.81k
        dst += 16;
400
#else
401
        XOR_INITIAL128(xmm_t0);
402
#endif
403
1.81k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
1.81k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
1.81k
    }
407
408
16.8k
partial:
409
16.8k
    if (len) {
410
6.67k
        memcpy(&xmm_crc_part, src, len);
411
6.67k
#ifdef COPY
412
6.67k
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
6.67k
        memcpy(dst, partial_buf, len);
414
6.67k
#endif
415
6.67k
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
6.67k
    }
417
418
16.8k
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
16.8k
}
Unexecuted instantiation: crc32_fold_vpclmulqdq_copy
crc32_fold_pclmulqdq
Line
Count
Source
23
1.43M
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
1.43M
#endif
25
1.43M
    unsigned long algn_diff;
26
1.43M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
1.43M
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
1.43M
    __m128i xmm_crc_part = _mm_setzero_si128();
29
1.43M
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
1.43M
#ifndef COPY
31
1.43M
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
1.43M
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
1.43M
    assert(len >= 16 || first == 0);
39
1.43M
#endif
40
1.43M
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
1.43M
    if (len < 16) {
43
853
        if (len == 0)
44
0
            return;
45
46
853
        memcpy(partial_buf, src, len);
47
853
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
#ifdef COPY
49
        memcpy(dst, partial_buf, len);
50
#endif
51
853
        goto partial;
52
853
    }
53
54
1.43M
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
1.43M
    if (algn_diff) {
56
1.17M
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
#ifdef COPY
58
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
        dst += algn_diff;
60
#else
61
1.17M
        XOR_INITIAL128(xmm_crc_part);
62
63
1.17M
        if (algn_diff < 4 && init_crc != 0) {
64
104k
            xmm_t0 = xmm_crc_part;
65
104k
            if (len >= 32) {
66
39.6k
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
39.6k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
39.6k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
64.4k
            } else {
70
64.4k
                memcpy(partial_buf, src + 16, len - 16);
71
64.4k
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
64.4k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
64.4k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
64.4k
                src += 16;
75
64.4k
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
64.4k
                goto partial;
80
64.4k
            }
81
82
39.6k
            src += 16;
83
39.6k
            len -= 16;
84
39.6k
        }
85
1.10M
#endif
86
87
1.10M
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
1.10M
        src += algn_diff;
90
1.10M
        len -= algn_diff;
91
1.10M
    }
92
93
#ifdef X86_VPCLMULQDQ
94
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
        len -= n;
104
        src += n;
105
    }
106
#endif
107
108
1.37M
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDs and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
1.93M
        while (len >= 512 + 64 + 16*8) {
116
557k
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
557k
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
557k
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
557k
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
557k
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
557k
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
557k
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
557k
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
#ifdef COPY
125
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
            dst += 16*8;
134
#else
135
557k
            XOR_INITIAL128(chorba8);
136
557k
#endif
137
557k
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
557k
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
557k
            src += 16*8;
140
557k
            len -= 16*8;
141
142
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
557k
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
#ifdef COPY
149
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
            dst += 64;
154
#endif
155
557k
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
#ifdef COPY
171
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
            dst += 64;
176
#endif
177
178
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
#ifdef COPY
194
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
            dst += 64;
199
#endif
200
201
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
557k
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
557k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
#ifdef COPY
217
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
            dst += 64;
222
#endif
223
224
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
#ifdef COPY
240
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
            dst += 64;
245
#endif
246
247
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
#ifdef COPY
263
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
            dst += 64;
268
#endif
269
270
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
#ifdef COPY
286
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
            dst += 64;
291
#endif
292
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
557k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
557k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
557k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
557k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
557k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
557k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
#ifdef COPY
308
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
            dst += 64;
313
#endif
314
557k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
557k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
557k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
557k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
557k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
557k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
557k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
557k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
557k
            len -= 512;
324
557k
            src += 512;
325
557k
        }
326
1.37M
#endif /* WITHOUT_CHORBA */
327
328
1.89M
    while (len >= 64) {
329
522k
        len -= 64;
330
522k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
522k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
522k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
522k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
522k
        src += 64;
335
336
522k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
#ifdef COPY
338
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
        dst += 64;
343
#else
344
522k
        XOR_INITIAL128(xmm_t0);
345
522k
#endif
346
347
522k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
522k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
522k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
522k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
522k
    }
352
353
    /*
354
     * Fewer than 64 bytes of input remain at this point.
355
     */
356
1.37M
    if (len >= 48) {
357
176k
        len -= 48;
358
359
176k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
176k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
176k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
176k
        src += 48;
363
#ifdef COPY
364
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
        dst += 48;
368
#else
369
176k
        XOR_INITIAL128(xmm_t0);
370
176k
#endif
371
176k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
176k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
176k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
176k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
1.19M
    } else if (len >= 32) {
377
194k
        len -= 32;
378
379
194k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
194k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
194k
        src += 32;
382
#ifdef COPY
383
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
        dst += 32;
386
#else
387
194k
        XOR_INITIAL128(xmm_t0);
388
194k
#endif
389
194k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
194k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
194k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
1.00M
    } else if (len >= 16) {
394
894k
        len -= 16;
395
894k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
894k
        src += 16;
397
#ifdef COPY
398
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
        dst += 16;
400
#else
401
894k
        XOR_INITIAL128(xmm_t0);
402
894k
#endif
403
894k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
894k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
894k
    }
407
408
1.43M
partial:
409
1.43M
    if (len) {
410
1.17M
        memcpy(&xmm_crc_part, src, len);
411
#ifdef COPY
412
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
        memcpy(dst, partial_buf, len);
414
#endif
415
1.17M
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
1.17M
    }
417
418
1.43M
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
1.43M
}
Unexecuted instantiation: crc32_fold_vpclmulqdq