Coverage Report

Created: 2025-10-10 06:20

/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
Line | Count | Source
1
/*
2
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
3
 * instruction.
4
 *
5
 * A white paper describing this algorithm can be found at:
6
 *     doc/crc-pclmulqdq.pdf
7
 *
8
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
9
 * Copyright (C) 2016 Marian Beermann (support for initial value)
10
 * Authors:
11
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
12
 *     Jim Guilford    <james.guilford@intel.com>
13
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
14
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
15
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
16
 *
17
 * For conditions of distribution and use, see copyright notice in zlib.h
18
 */
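
Note: the folding approach rests on CRC-32 being polynomial arithmetic over GF(2). As a sketch of the identity the fold_* helpers below exploit (paraphrased, not quoted, from the white paper referenced above): split the message as

    M(x) = A(x) * x^(8n) + B(x)        (addition over GF(2) is XOR)

where B holds the trailing n bytes. Then

    M(x) mod P(x) = ( A(x) * (x^(8n) mod P(x)) + B(x) ) mod P(x)

so data already accumulated in a register can be carried forward over n bytes with one carry-less multiply by the precomputed constant x^(8n) mod P(x) and an XOR into the stream n bytes ahead; the final reduction mod P(x) is deferred to the end. The fold_1/fold_2/fold_3/fold_4/fold_12 helpers presumably apply this with folding distances that are multiples of 16 bytes.
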
19
20
#ifdef COPY
21
19.4k
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
22
#else
23
1.18M
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
1.18M
#endif
25
1.18M
    unsigned long algn_diff;
26
1.18M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
1.18M
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
1.18M
    __m128i xmm_crc_part = _mm_setzero_si128();
29
1.18M
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
#ifndef COPY
31
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
    assert(len >= 16 || first == 0);
39
#endif
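
Note: the assert above documents the caller contract; inputs shorter than 16 bytes with a non-zero starting CRC never reach this template and are handled by a scalar path elsewhere in zlib-ng. For checking the folded paths in this file, a self-contained bit-at-a-time reference for the same CRC-32 (reflected polynomial 0xEDB88320) is a convenient oracle. This is only a sketch for comparison, not zlib-ng's actual scalar fallback:

    #include <stddef.h>
    #include <stdint.h>

    /* Bit-at-a-time CRC-32, reflected polynomial 0xEDB88320, with the usual
     * ~0 pre- and post-conditioning; it matches zlib's crc32() and therefore
     * what this template computes once its reset/final steps are applied. */
    static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *buf, size_t len) {
        crc = ~crc;
        for (size_t i = 0; i < len; i++) {
            crc ^= buf[i];
            for (int k = 0; k < 8; k++)
                crc = (crc >> 1) ^ (0xEDB88320u & (uint32_t)-(int32_t)(crc & 1u));
        }
        return ~crc;
    }
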
40
1.18M
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
1.20M
    if (len < 16) {
43
1.65k
        if (len == 0)
44
0
            return;
45
46
1.65k
        memcpy(partial_buf, src, len);
47
1.65k
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
#ifdef COPY
49
        memcpy(dst, partial_buf, len);
50
#endif
51
1.65k
        goto partial;
52
1.65k
    }
53
54
1.20M
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
1.20M
    if (algn_diff) {
56
989k
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
#ifdef COPY
58
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
        dst += algn_diff;
60
#else
61
987k
        XOR_INITIAL128(xmm_crc_part);
62
63
987k
        if (algn_diff < 4 && init_crc != 0) {
64
83.6k
            xmm_t0 = xmm_crc_part;
65
83.6k
            if (len >= 32) {
66
37.2k
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
37.2k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
37.2k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
46.3k
            } else {
70
46.3k
                memcpy(partial_buf, src + 16, len - 16);
71
46.3k
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
46.3k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
46.3k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
46.3k
                src += 16;
75
46.3k
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
46.3k
                goto partial;
80
46.3k
            }
81
82
37.2k
            src += 16;
83
37.2k
            len -= 16;
84
37.2k
        }
85
940k
#endif
86
87
940k
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
940k
        src += algn_diff;
90
940k
        len -= algn_diff;
91
940k
    }
92
93
#ifdef X86_VPCLMULQDQ
94
0
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
0
        len -= n;
104
0
        src += n;
105
0
    }
106
#endif
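
Note: the zero counts on the block above are expected for this run; the len >= 256 fast path is compiled only when X86_VPCLMULQDQ is defined and is dispatched only on CPUs that advertise VPCLMULQDQ (the real dispatcher checks more than this single bit, e.g. the AVX-512 features it needs). A minimal runtime probe of the feature bit, as a sketch using GCC/Clang's <cpuid.h>:

    #include <cpuid.h>   /* GCC/Clang CPUID helpers */
    #include <stdio.h>

    /* VPCLMULQDQ is reported in CPUID.(EAX=7, ECX=0):ECX, bit 10. */
    static int has_vpclmulqdq(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 0;
        return (int)((ecx >> 10) & 1u);
    }

    int main(void) {
        printf("VPCLMULQDQ supported: %s\n", has_vpclmulqdq() ? "yes" : "no");
        return 0;
    }
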
107
108
0
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDS and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
2.64M
        while (len >= 512 + 64 + 16*8) {
116
1.48M
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
1.48M
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
1.48M
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
1.48M
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
1.48M
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
1.48M
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
1.48M
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
1.48M
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
#ifdef COPY
125
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
            dst += 16*8;
134
#else
135
667k
            XOR_INITIAL128(chorba8);
136
#endif
137
1.48M
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
1.48M
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
1.48M
            src += 16*8;
140
1.48M
            len -= 16*8;
141
142
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
1.48M
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
#ifdef COPY
149
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
            dst += 64;
154
#endif
155
1.48M
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
#ifdef COPY
171
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
            dst += 64;
176
#endif
177
178
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
#ifdef COPY
194
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
            dst += 64;
199
#endif
200
201
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
1.48M
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
1.48M
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
#ifdef COPY
217
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
            dst += 64;
222
#endif
223
224
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
#ifdef COPY
240
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
            dst += 64;
245
#endif
246
247
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
#ifdef COPY
263
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
            dst += 64;
268
#endif
269
270
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
#ifdef COPY
286
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
            dst += 64;
291
#endif
292
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
1.48M
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
1.48M
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
1.48M
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
1.48M
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
1.48M
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
1.48M
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
#ifdef COPY
308
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
            dst += 64;
313
#endif
314
1.48M
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
1.48M
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
1.48M
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
1.48M
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
1.48M
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
1.48M
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
1.48M
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
1.48M
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
1.48M
            len -= 512;
324
1.48M
            src += 512;
325
1.48M
        }
326
1.13M
#endif /* WITHOUT_CHORBA */
327
328
1.88M
    while (len >= 64) {
329
728k
        len -= 64;
330
728k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
728k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
728k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
728k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
728k
        src += 64;
335
336
728k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
#ifdef COPY
338
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
        dst += 64;
343
#else
344
682k
        XOR_INITIAL128(xmm_t0);
345
#endif
346
347
728k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
728k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
728k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
728k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
728k
    }
352
353
    /*
354
     * Fewer than 64 bytes remain; handle the 48/32/16-byte and partial tails.
355
     */
356
1.15M
    if (len >= 48) {
357
233k
        len -= 48;
358
359
233k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
233k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
233k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
233k
        src += 48;
363
#ifdef COPY
364
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
        dst += 48;
368
#else
369
230k
        XOR_INITIAL128(xmm_t0);
370
#endif
371
233k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
233k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
233k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
233k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
925k
    } else if (len >= 32) {
377
136k
        len -= 32;
378
379
136k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
136k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
136k
        src += 32;
382
#ifdef COPY
383
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
        dst += 32;
386
#else
387
135k
        XOR_INITIAL128(xmm_t0);
388
#endif
389
136k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
136k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
136k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
788k
    } else if (len >= 16) {
394
694k
        len -= 16;
395
694k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
694k
        src += 16;
397
#ifdef COPY
398
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
        dst += 16;
400
#else
401
692k
        XOR_INITIAL128(xmm_t0);
402
#endif
403
694k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
694k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
694k
    }
407
408
1.20M
partial:
409
1.20M
    if (len) {
410
1.00M
        memcpy(&xmm_crc_part, src, len);
411
#ifdef COPY
412
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
        memcpy(dst, partial_buf, len);
414
#endif
415
1.00M
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
1.00M
    }
417
418
1.20M
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
1.20M
}
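
Note: the remaining uncovered counts above (the len == 0 early return, the X86_VPCLMULQDQ block, and the two unexecuted vpclmulqdq instantiations reported below) come down to which lengths, alignments, and build flags the test corpus feeds through the top-level crc32() entry point. A sketch of a harness that sweeps lengths and starting offsets so that the partial, 16/32/48-byte, 64-byte, and Chorba (>= 704 byte) paths all execute, assuming the binary links against a zlib-ng build whose crc32() dispatches to this code:

    #include <assert.h>
    #include <stdlib.h>
    #include <zlib.h>

    int main(void) {
        /* Standard check value: CRC-32("123456789") == 0xCBF43926. */
        static const unsigned char check[] = "123456789";
        assert(crc32(crc32(0L, Z_NULL, 0), check, 9) == 0xCBF43926UL);

        enum { N = 4096 };
        unsigned char *buf = malloc(N + 64);
        if (buf == NULL)
            return 1;
        for (size_t i = 0; i < N + 64; i++)
            buf[i] = (unsigned char)(i * 131u + 7u);

        /* Sweep starting offsets (to hit the alignment prologue) and lengths
         * (to hit every tail branch); a one-shot CRC must match the same data
         * CRC'd in two chunks. */
        for (size_t off = 0; off < 16; off++) {
            for (size_t len = 0; len <= N; len += 61) {
                const unsigned char *p = buf + off;
                unsigned long whole = crc32(crc32(0L, Z_NULL, 0), p, (uInt)len);
                size_t cut = len / 3;
                unsigned long split = crc32(0L, Z_NULL, 0);
                split = crc32(split, p, (uInt)cut);
                split = crc32(split, p + cut, (uInt)(len - cut));
                assert(whole == split);
            }
        }
        free(buf);
        return 0;
    }
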
crc32_fold_pclmulqdq_copy
Line | Count | Source
21
19.4k
Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
22
#else
23
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
#endif
25
19.4k
    unsigned long algn_diff;
26
19.4k
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
19.4k
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
19.4k
    __m128i xmm_crc_part = _mm_setzero_si128();
29
19.4k
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
#ifndef COPY
31
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
    assert(len >= 16 || first == 0);
39
#endif
40
19.4k
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
19.4k
    if (len < 16) {
43
857
        if (len == 0)
44
0
            return;
45
46
857
        memcpy(partial_buf, src, len);
47
857
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
857
#ifdef COPY
49
857
        memcpy(dst, partial_buf, len);
50
857
#endif
51
857
        goto partial;
52
857
    }
53
54
18.5k
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
18.5k
    if (algn_diff) {
56
1.84k
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
1.84k
#ifdef COPY
58
1.84k
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
1.84k
        dst += algn_diff;
60
#else
61
        XOR_INITIAL128(xmm_crc_part);
62
63
        if (algn_diff < 4 && init_crc != 0) {
64
            xmm_t0 = xmm_crc_part;
65
            if (len >= 32) {
66
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
            } else {
70
                memcpy(partial_buf, src + 16, len - 16);
71
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
                src += 16;
75
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
                goto partial;
80
            }
81
82
            src += 16;
83
            len -= 16;
84
        }
85
#endif
86
87
1.84k
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
1.84k
        src += algn_diff;
90
1.84k
        len -= algn_diff;
91
1.84k
    }
92
93
#ifdef X86_VPCLMULQDQ
94
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
        len -= n;
104
        src += n;
105
    }
106
#endif
107
108
18.5k
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDS and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
834k
        while (len >= 512 + 64 + 16*8) {
116
815k
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
815k
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
815k
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
815k
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
815k
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
815k
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
815k
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
815k
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
815k
#ifdef COPY
125
815k
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
815k
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
815k
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
815k
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
815k
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
815k
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
815k
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
815k
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
815k
            dst += 16*8;
134
#else
135
            XOR_INITIAL128(chorba8);
136
#endif
137
815k
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
815k
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
815k
            src += 16*8;
140
815k
            len -= 16*8;
141
142
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
815k
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
815k
#ifdef COPY
149
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
815k
            dst += 64;
154
815k
#endif
155
815k
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
815k
#ifdef COPY
171
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
815k
            dst += 64;
176
815k
#endif
177
178
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
815k
#ifdef COPY
194
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
815k
            dst += 64;
199
815k
#endif
200
201
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
815k
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
815k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
815k
#ifdef COPY
217
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
815k
            dst += 64;
222
815k
#endif
223
224
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
815k
#ifdef COPY
240
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
815k
            dst += 64;
245
815k
#endif
246
247
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
815k
#ifdef COPY
263
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
815k
            dst += 64;
268
815k
#endif
269
270
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
815k
#ifdef COPY
286
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
815k
            dst += 64;
291
815k
#endif
292
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
815k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
815k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
815k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
815k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
815k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
815k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
815k
#ifdef COPY
308
815k
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
815k
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
815k
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
815k
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
815k
            dst += 64;
313
815k
#endif
314
815k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
815k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
815k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
815k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
815k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
815k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
815k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
815k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
815k
            len -= 512;
324
815k
            src += 512;
325
815k
        }
326
18.5k
#endif /* WITHOUT_CHORBA */
327
328
65.0k
    while (len >= 64) {
329
46.4k
        len -= 64;
330
46.4k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
46.4k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
46.4k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
46.4k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
46.4k
        src += 64;
335
336
46.4k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
46.4k
#ifdef COPY
338
46.4k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
46.4k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
46.4k
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
46.4k
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
46.4k
        dst += 64;
343
#else
344
        XOR_INITIAL128(xmm_t0);
345
#endif
346
347
46.4k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
46.4k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
46.4k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
46.4k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
46.4k
    }
352
353
    /*
354
     * Fewer than 64 bytes remain; handle the 48/32/16-byte and partial tails.
355
     */
356
18.5k
    if (len >= 48) {
357
2.62k
        len -= 48;
358
359
2.62k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
2.62k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
2.62k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
2.62k
        src += 48;
363
2.62k
#ifdef COPY
364
2.62k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
2.62k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
2.62k
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
2.62k
        dst += 48;
368
#else
369
        XOR_INITIAL128(xmm_t0);
370
#endif
371
2.62k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
2.62k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
2.62k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
2.62k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
15.9k
    } else if (len >= 32) {
377
1.38k
        len -= 32;
378
379
1.38k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
1.38k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
1.38k
        src += 32;
382
1.38k
#ifdef COPY
383
1.38k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
1.38k
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
1.38k
        dst += 32;
386
#else
387
        XOR_INITIAL128(xmm_t0);
388
#endif
389
1.38k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
1.38k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
1.38k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
14.5k
    } else if (len >= 16) {
394
1.92k
        len -= 16;
395
1.92k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
1.92k
        src += 16;
397
1.92k
#ifdef COPY
398
1.92k
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
1.92k
        dst += 16;
400
#else
401
        XOR_INITIAL128(xmm_t0);
402
#endif
403
1.92k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
1.92k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
1.92k
    }
407
408
19.4k
partial:
409
19.4k
    if (len) {
410
7.22k
        memcpy(&xmm_crc_part, src, len);
411
7.22k
#ifdef COPY
412
7.22k
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
7.22k
        memcpy(dst, partial_buf, len);
414
7.22k
#endif
415
7.22k
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
7.22k
    }
417
418
19.4k
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
19.4k
}
Unexecuted instantiation: crc32_fold_vpclmulqdq_copy
crc32_fold_pclmulqdq
Line | Count | Source
23
1.18M
Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
24
1.18M
#endif
25
1.18M
    unsigned long algn_diff;
26
1.18M
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
27
1.18M
    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
28
1.18M
    __m128i xmm_crc_part = _mm_setzero_si128();
29
1.18M
    char ALIGNED_(16) partial_buf[16] = { 0 };
30
1.18M
#ifndef COPY
31
1.18M
    __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
32
1.18M
    int32_t first = init_crc != 0;
33
34
    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
35
     * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
36
     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
37
     * up to 15 bytes + one full vector load. */
38
1.18M
    assert(len >= 16 || first == 0);
39
1.18M
#endif
40
1.18M
    crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
41
42
1.18M
    if (len < 16) {
43
798
        if (len == 0)
44
0
            return;
45
46
798
        memcpy(partial_buf, src, len);
47
798
        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
48
#ifdef COPY
49
        memcpy(dst, partial_buf, len);
50
#endif
51
798
        goto partial;
52
798
    }
53
54
1.18M
    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
55
1.18M
    if (algn_diff) {
56
987k
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
57
#ifdef COPY
58
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
59
        dst += algn_diff;
60
#else
61
987k
        XOR_INITIAL128(xmm_crc_part);
62
63
987k
        if (algn_diff < 4 && init_crc != 0) {
64
83.6k
            xmm_t0 = xmm_crc_part;
65
83.6k
            if (len >= 32) {
66
37.2k
                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
67
37.2k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
68
37.2k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
69
46.3k
            } else {
70
46.3k
                memcpy(partial_buf, src + 16, len - 16);
71
46.3k
                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
72
46.3k
                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
73
46.3k
                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
74
46.3k
                src += 16;
75
46.3k
                len -= 16;
76
#ifdef COPY
77
                dst -= algn_diff;
78
#endif
79
46.3k
                goto partial;
80
46.3k
            }
81
82
37.2k
            src += 16;
83
37.2k
            len -= 16;
84
37.2k
        }
85
940k
#endif
86
87
940k
        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
88
89
940k
        src += algn_diff;
90
940k
        len -= algn_diff;
91
940k
    }
92
93
#ifdef X86_VPCLMULQDQ
94
    if (len >= 256) {
95
#ifdef COPY
96
        size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
97
        dst += n;
98
#else
99
        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
100
            xmm_initial, first);
101
        first = 0;
102
#endif
103
        len -= n;
104
        src += n;
105
    }
106
#endif
107
108
1.13M
#ifndef WITHOUT_CHORBA
109
        /* Implement the Chorba algorithm from https://arxiv.org/abs/2412.16398
110
         * We interleave the PCLMUL-based folds with 8x scaled generator
111
         * polynomial copies; we read 8x QWORDS and then XOR them into
112
         * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
113
         * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
114
         * as "generator_64_bits_unrolled_8" */
115
1.80M
        while (len >= 512 + 64 + 16*8) {
116
667k
            __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
117
667k
            __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
118
667k
            __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
119
667k
            __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
120
667k
            __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
121
667k
            __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
122
667k
            __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
123
667k
            __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
124
#ifdef COPY
125
            _mm_storeu_si128((__m128i *)dst, chorba8);
126
            _mm_storeu_si128((__m128i *)dst + 1, chorba7);
127
            _mm_storeu_si128((__m128i *)dst + 2, chorba6);
128
            _mm_storeu_si128((__m128i *)dst + 3, chorba5);
129
            _mm_storeu_si128((__m128i *)dst + 4, chorba4);
130
            _mm_storeu_si128((__m128i *)dst + 5, chorba3);
131
            _mm_storeu_si128((__m128i *)dst + 6, chorba2);
132
            _mm_storeu_si128((__m128i *)dst + 7, chorba1);
133
            dst += 16*8;
134
#else
135
667k
            XOR_INITIAL128(chorba8);
136
667k
#endif
137
667k
            chorba2 = _mm_xor_si128(chorba2, chorba8);
138
667k
            chorba1 = _mm_xor_si128(chorba1, chorba7);
139
667k
            src += 16*8;
140
667k
            len -= 16*8;
141
142
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src);
143
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
144
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
145
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
146
147
667k
            fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
148
#ifdef COPY
149
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
150
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
151
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
152
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
153
            dst += 64;
154
#endif
155
667k
            xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
156
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
157
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
158
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
159
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
160
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
161
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
162
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
163
164
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
165
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
166
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
167
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
168
169
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
170
#ifdef COPY
171
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
172
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
173
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
174
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
175
            dst += 64;
176
#endif
177
178
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
179
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
180
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
181
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
182
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
183
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
184
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
185
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
186
187
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
188
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
189
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
190
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
191
192
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
193
#ifdef COPY
194
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
195
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
196
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
197
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
198
            dst += 64;
199
#endif
200
201
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
202
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
203
667k
            xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
204
667k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
205
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
206
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
207
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
208
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
209
210
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
211
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
212
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
213
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
214
215
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
216
#ifdef COPY
217
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
218
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
219
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
220
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
221
            dst += 64;
222
#endif
223
224
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
225
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
226
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
227
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
228
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
229
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
230
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
231
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
232
233
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
234
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
235
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
236
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
237
238
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
239
#ifdef COPY
240
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
241
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
242
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
243
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
244
            dst += 64;
245
#endif
246
247
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
248
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
249
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
250
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
251
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
252
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
253
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
254
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
255
256
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
257
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
258
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
259
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
260
261
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
262
#ifdef COPY
263
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
264
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
265
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
266
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
267
            dst += 64;
268
#endif
269
270
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
271
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
272
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
273
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
274
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
275
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
276
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
277
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
278
279
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
280
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
281
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
282
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
283
284
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
285
#ifdef COPY
286
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
287
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
288
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
289
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
290
            dst += 64;
291
#endif
292
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
293
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
294
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
295
667k
            xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
296
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
297
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
298
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
299
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
300
301
667k
            xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
302
667k
            xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
303
667k
            xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
304
667k
            xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
305
306
667k
            fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
307
#ifdef COPY
308
            _mm_storeu_si128((__m128i *)dst, xmm_t0);
309
            _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
310
            _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
311
            _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
312
            dst += 64;
313
#endif
314
667k
            xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
315
667k
            xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
316
667k
            xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
317
667k
            xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
318
667k
            xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
319
667k
            xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
320
667k
            xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
321
667k
            xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
322
323
667k
            len -= 512;
324
667k
            src += 512;
325
667k
        }
326
1.13M
#endif /* WITHOUT_CHORBA */
327
328
1.82M
    while (len >= 64) {
329
682k
        len -= 64;
330
682k
        xmm_t0 = _mm_load_si128((__m128i *)src);
331
682k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
332
682k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
333
682k
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
334
682k
        src += 64;
335
336
682k
        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
337
#ifdef COPY
338
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
339
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
340
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
341
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
342
        dst += 64;
343
#else
344
682k
        XOR_INITIAL128(xmm_t0);
345
682k
#endif
346
347
682k
        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
348
682k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
349
682k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
350
682k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
351
682k
    }
352
353
    /*
354
     * Fewer than 64 bytes remain; handle the 48/32/16-byte and partial tails.
355
     */
356
1.13M
    if (len >= 48) {
357
230k
        len -= 48;
358
359
230k
        xmm_t0 = _mm_load_si128((__m128i *)src);
360
230k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
361
230k
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
362
230k
        src += 48;
363
#ifdef COPY
364
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
365
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
366
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
367
        dst += 48;
368
#else
369
230k
        XOR_INITIAL128(xmm_t0);
370
230k
#endif
371
230k
        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
372
373
230k
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
374
230k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
375
230k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
376
909k
    } else if (len >= 32) {
377
135k
        len -= 32;
378
379
135k
        xmm_t0 = _mm_load_si128((__m128i *)src);
380
135k
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
381
135k
        src += 32;
382
#ifdef COPY
383
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
384
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
385
        dst += 32;
386
#else
387
135k
        XOR_INITIAL128(xmm_t0);
388
135k
#endif
389
135k
        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
390
391
135k
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
392
135k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
393
774k
    } else if (len >= 16) {
394
692k
        len -= 16;
395
692k
        xmm_t0 = _mm_load_si128((__m128i *)src);
396
692k
        src += 16;
397
#ifdef COPY
398
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
399
        dst += 16;
400
#else
401
692k
        XOR_INITIAL128(xmm_t0);
402
692k
#endif
403
692k
        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
404
405
692k
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
406
692k
    }
407
408
1.18M
partial:
409
1.18M
    if (len) {
410
995k
        memcpy(&xmm_crc_part, src, len);
411
#ifdef COPY
412
        _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
413
        memcpy(dst, partial_buf, len);
414
#endif
415
995k
        partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
416
995k
    }
417
418
1.18M
    crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419
1.18M
}
Unexecuted instantiation: crc32_fold_vpclmulqdq