Coverage Report

Created: 2026-02-14 07:07

/src/zlib-ng/arch/x86/crc32_chorba_sse41.c
Every instrumented line below has an execution count of 0; the file was not exercised in this run.

Source
#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch_functions.h"

#define READ_NEXT(in, off, a, b) do { \
        a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
        b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
        } while (0);

#define NEXT_ROUND(invec, a, b, c, d) do { \
        a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
        b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
        c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
        d  = _mm_srli_epi64(invec, 20); \
        } while (0);

#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
        out0 = _mm_slli_si128(in0, shift); \
        out1 = _mm_alignr_epi8(in1, in0, shift); \
        out2 = _mm_alignr_epi8(in2, in1, shift); \
        out3 = _mm_alignr_epi8(in3, in2, shift); \
        out4 = _mm_srli_si128(in3, shift); \
        } while (0)

#define STORE4(out0, out1, out2, out3, out) do { \
        _mm_store_si128(out++, out0); \
        _mm_store_si128(out++, out1); \
        _mm_store_si128(out++, out2); \
        _mm_store_si128(out++, out3); \
    } while (0)

#define READ4(out0, out1, out2, out3, in) do { \
    out0 = _mm_load_si128(in++); \
    out1 = _mm_load_si128(in++); \
    out2 = _mm_load_si128(in++); \
    out3 = _mm_load_si128(in++); \
    } while (0)

/* This is intentionally shifted one down to compensate for the deferred store from
 * the last iteration */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
    out0 = _mm_xor_si128(in[1], xor0); \
    out1 = _mm_xor_si128(in[2], xor1); \
    out2 = _mm_xor_si128(in[3], xor2); \
    out3 = _mm_xor_si128(in[4], xor3); \
    } while (0)

Z_FORCEINLINE static uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t *input, size_t len) {
    ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    __m128i *bitbuffer_v = (__m128i*)bitbuffer;
    const uint8_t *bitbuffer_bytes = (const uint8_t*)bitbuffer;
    __m128i z = _mm_setzero_si128();

    __m128i *bitbuf128 = &bitbuffer_v[64];
    __m128i *bitbuf144 = &bitbuffer_v[72];
    __m128i *bitbuf182 = &bitbuffer_v[91];
    __m128i *bitbuf210 = &bitbuffer_v[105];
    __m128i *bitbuf300 = &bitbuffer_v[150];
    __m128i *bitbuf0 = bitbuf128;
    __m128i *inptr = (__m128i*)input;

    /* We only need to zero out the bytes between the 128'th value and the 144th
     * that are actually read */
    __m128i *z_cursor = bitbuf128;
    for (size_t i = 0; i < 2; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* We only need to zero out the bytes between the 144'th value and the 182nd that
     * are actually read */
    z_cursor = bitbuf144 + 8;
    for (size_t i = 0; i < 11; ++i) {
        _mm_store_si128(z_cursor++, z);
    }

    /* We only need to zero out the bytes between the 182nd value and the 210th that
     * are actually read. */
    z_cursor = bitbuf182;
    for (size_t i = 0; i < 4; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* We need to mix this in */
    __m128i init_crc = _mm_cvtsi64_si128(~crc);
    crc = 0;

    size_t i = 0;

    /* Previous iteration runs carried over */
    __m128i buf144 = z;
    __m128i buf182 = z;
    __m128i buf210 = z;

    for (; i + 300*8+64 < len && i < 22 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        READ4(in12, in34, in56, in78, inptr);

        if (i == 0) {
            in12 = _mm_xor_si128(in12, init_crc);
        }

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e = _mm_xor_si128(buf182, in_1);
        STORE4(e, in23, in45, in67, bitbuf182);
        buf182 = in8_;

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    for (; i + 300*8+64 < len && i < 32 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    for (; i + 300*8+64 < len && i < 84 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    for (; i + 300*8+64 < len; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        if (i < 128 * 8) {
            READ4(in12, in34, in56, in78, inptr);
        } else {
            in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
        }

        // [0, 145, 183, 211]

        /* Pre Penryn CPUs the unpack should be faster */
        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i n, o, p;
        __m128i m = _mm_xor_si128(buf210, in_1);

        /* Couldn't tell you why but despite knowing that this is always false,
         * removing this branch with GCC makes things significantly slower. Some
         * loop bodies must be being joined or something */
        if (i < 84 * 8) {
            n = in23;
            o = in45;
            p = in67;
            buf210 = in8_;
        } else {
            READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
        }

        STORE4(m, n, o, p, bitbuf210);
        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    /* Second half of stores bubbled out */
    _mm_store_si128(bitbuf144, buf144);
    _mm_store_si128(bitbuf182, buf182);
    _mm_store_si128(bitbuf210, buf210);

    /* We also have to zero out the tail */
    size_t left_to_z = len - (300*8 + i);
    __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
    while (left_to_z >= 64) {
       STORE4(z, z, z, z, bitbuf_tail);
       left_to_z -= 64;
    }

    while (left_to_z >= 16) {
       _mm_store_si128(bitbuf_tail++, z);
       left_to_z -= 16;
    }

    uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
    while (left_to_z--) {
       *tail_bytes++ = 0;
    }

    ALIGNED_(16) uint64_t final[9] = {0};
    __m128i next12, next34, next56;
    next12 = z;
    next34 = z;
    next56 = z;

    for (; (i + 72 < len); i += 32) {
        __m128i in1in2, in3in4;
        __m128i in1in2_, in3in4_;
        __m128i ab1, ab2, ab3, ab4;
        __m128i cd1, cd2, cd3, cd4;

        READ_NEXT(input, i, in1in2, in3in4);
        READ_NEXT(bitbuffer, i, in1in2_, in3in4_);

        in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
        in3in4 = _mm_xor_si128(in3in4, in3in4_);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        __m128i a2_ = _mm_slli_si128(ab2, 8);
        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
        __m128i a4_ = _mm_slli_si128(ab4, 8);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_srli_si128(cd2, 8);
        __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
        next12 = _mm_xor_si128(next12, next56);
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_srli_si128(cd4, 8);
    }

    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
    __m128i *final128 = (__m128i*)final;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));

    uint8_t *final_bytes = (uint8_t*)final;

    for (size_t j = 0; j < (len-i); j++) {
        crc = crc_table[(crc ^ final_bytes[j] ^ bitbuffer_bytes[(j+i)]) & 0xff] ^ (crc >> 8);
    }

    return ~crc;
}

Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
    uintptr_t align_diff = ALIGN_DIFF(buf, 16);
    if (len <= align_diff + CHORBA_SMALL_THRESHOLD_64BIT)
        return crc32_braid(crc, buf, len);

    if (align_diff) {
        crc = crc32_braid(crc, buf, align_diff);
        len -= align_diff;
        buf += align_diff;
    }
#if !defined(WITHOUT_CHORBA)
    if (len > CHORBA_LARGE_THRESHOLD)
        return crc32_chorba_118960_nondestructive(crc, (z_word_t*)buf, len);
#endif
    if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
        return crc32_chorba_32768_nondestructive_sse41(crc, (const uint64_t*)buf, len);
    return chorba_small_nondestructive_sse2(crc, (const uint64_t*)buf, len);
}

Z_INTERNAL uint32_t crc32_copy_chorba_sse41(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    crc = crc32_chorba_sse41(crc, src, len);
    memcpy(dst, src, len);
    return crc;
}
#endif
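
Every count above is 0, so nothing in this run reaches the SSE4.1 Chorba code. Below is a minimal sketch of a driver that could exercise the medium-size path. It is an assumption-laden illustration, not part of the report: it assumes a zlib-ng build with X86_SSE41 defined, that the Z_INTERNAL symbols crc32_chorba_sse41 and crc32_braid can be linked from a test, and that a 24 KiB input falls between CHORBA_MEDIUM_LOWER_THRESHOLD and CHORBA_MEDIUM_UPPER_THRESHOLD (the buffer size is an illustrative guess, not taken from the report).

/* Hypothetical coverage driver (assumptions noted above); compile and link
 * against the zlib-ng objects that provide the two functions declared here. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Prototypes mirror the Z_INTERNAL definitions in the listing; in-tree they
 * would normally come from arch_functions.h. */
uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, size_t len);

int main(void) {
    static uint8_t buf[24 * 1024];                /* assumed to land in the "medium" range */
    for (size_t i = 0; i < sizeof(buf); i++)
        buf[i] = (uint8_t)(i * 31u + 7u);         /* deterministic, non-trivial pattern */

    uint32_t expected = crc32_braid(0, buf, sizeof(buf));     /* scalar reference */
    uint32_t got = crc32_chorba_sse41(0, buf, sizeof(buf));   /* path under test */

    printf("braid=%08x chorba_sse41=%08x\n", (unsigned)expected, (unsigned)got);
    return got == expected ? 0 : 1;
}

Comparing against crc32_braid keeps the check self-contained: both entry points should return the same CRC-32 for the same input, so a mismatch flags the SIMD path rather than relying on a hard-coded expected value.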