/src/zlib-ng/arch/x86/chorba_sse41.c
#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch_functions.h"
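
/* Rough sketch of the approach, inferred from the code below: each 64-byte block
 * of input is XOR-copied into a scratch "bitbuffer" at a few fixed word offsets
 * ahead of the current position (the bitbuf pointers and the [0, 145, 183, 211]
 * note in the main loop), so later input absorbs the folded contribution of
 * earlier input. What remains after the copy loops is reduced further by the
 * shift/XOR rounds in NEXT_ROUND, and only the last few dozen bytes go through
 * the byte-wise braid table. The offsets and shift counts are taken as given
 * from the Chorba scheme and are not re-derived here. */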

#define READ_NEXT(in, off, a, b) do { \
    a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
    b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
} while (0)

#define NEXT_ROUND(invec, a, b, c, d) do { \
    a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
    b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
    c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
    d = _mm_srli_epi64(invec, 20); \
} while (0)
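
/* NEXT_ROUND amounts to a carryless multiply of each 64-bit lane by a sparse
 * constant, expressed as shifts and XORs: a, b and c hold the pieces that land in
 * the current and following words, d is the high spill-over the caller carries
 * into the next pair. The shift counts are assumed to be the Chorba reduction
 * constants as-is. */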

#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
    out0 = _mm_slli_si128(in0, shift); \
    out1 = _mm_alignr_epi8(in1, in0, shift); \
    out2 = _mm_alignr_epi8(in2, in1, shift); \
    out3 = _mm_alignr_epi8(in3, in2, shift); \
    out4 = _mm_srli_si128(in3, shift); \
} while (0)
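
/* Shifts a 64-byte block up by `shift` bytes across five output vectors (out0 is
 * the low end, out4 holds the bytes pushed past the original 64). With shift == 8
 * the block is offset by one 64-bit word, which is presumably why the copies
 * stored via bitbuf144/182/210 effectively land at word offsets 145/183/211. */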

#define STORE4(out0, out1, out2, out3, out) do { \
    _mm_store_si128(out++, out0); \
    _mm_store_si128(out++, out1); \
    _mm_store_si128(out++, out2); \
    _mm_store_si128(out++, out3); \
} while (0)

#define READ4(out0, out1, out2, out3, in) do { \
    out0 = _mm_load_si128(in++); \
    out1 = _mm_load_si128(in++); \
    out2 = _mm_load_si128(in++); \
    out3 = _mm_load_si128(in++); \
} while (0)

/* This is intentionally shifted one down to compensate for the deferred store from
 * the last iteration */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
    out0 = _mm_xor_si128(in[1], xor0); \
    out1 = _mm_xor_si128(in[2], xor1); \
    out2 = _mm_xor_si128(in[3], xor2); \
    out3 = _mm_xor_si128(in[4], xor3); \
} while (0)

static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
    const uint64_t* input = buf;
    ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    __m128i *bitbuffer_v = (__m128i*)bitbuffer;
    const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
    __m128i z = _mm_setzero_si128();

    __m128i *bitbuf128 = &bitbuffer_v[64];
    __m128i *bitbuf144 = &bitbuffer_v[72];
    __m128i *bitbuf182 = &bitbuffer_v[91];
    __m128i *bitbuf210 = &bitbuffer_v[105];
    __m128i *bitbuf300 = &bitbuffer_v[150];
    __m128i *bitbuf0 = bitbuf128;
    __m128i *inptr = (__m128i*)input;
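
    /* The numeric names are offsets in 64-bit words; bitbuffer_v is indexed in
     * 128-bit lanes, so e.g. &bitbuffer_v[72] == &bitbuffer[144]. bitbuf0 aliases
     * the word-128 position, where the main loop later starts reading the folded
     * data back and XORing it into the incoming input. */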

    /* We only need to zero out the bytes between the 128th value and the 144th
     * that are actually read */
    __m128i *z_cursor = bitbuf128;
    for (size_t i = 0; i < 2; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* We only need to zero out the bytes between the 144th value and the 182nd that
     * are actually read */
    z_cursor = bitbuf144 + 8;
    for (size_t i = 0; i < 11; ++i) {
        _mm_store_si128(z_cursor++, z);
    }

    /* We only need to zero out the bytes between the 182nd value and the 210th that
     * are actually read. */
    z_cursor = bitbuf182;
    for (size_t i = 0; i < 4; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* We need to mix the incoming CRC in */
    __m128i init_crc = _mm_cvtsi64_si128(crc);
    crc = 0;

    size_t i = 0;

    /* Values carried over from the previous iteration's deferred stores */
    __m128i buf144 = z;
    __m128i buf182 = z;
    __m128i buf210 = z;

    for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        READ4(in12, in34, in56, in78, inptr);

        if (i == 0) {
            in12 = _mm_xor_si128(in12, init_crc);
        }

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e = _mm_xor_si128(buf182, in_1);
        STORE4(e, in23, in45, in67, bitbuf182);
        buf182 = in8_;

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }
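
    /* The copy loop is staged: while a destination stream is still inside the
     * region zeroed above, plain STORE4 is enough; once it starts landing on
     * words already written by another stream it has to switch to READ4_WITHXOR
     * and fold into the existing contents. The cut-overs at i = 22*8, 32*8 and
     * 84*8 follow from the offset spacing and are taken as given here. */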

    for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    for(; i + 300*8+64 < len; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        if (i < 128 * 8) {
            READ4(in12, in34, in56, in78, inptr);
        } else {
            in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
        }

        // [0, 145, 183, 211]

        /* On pre-Penryn CPUs the unpack approach should be faster */
        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i n, o, p;
        __m128i m = _mm_xor_si128(buf210, in_1);

        /* This condition is known to always be false here, yet removing the branch
         * makes GCC generate significantly slower code, presumably because it then
         * merges loop bodies that are better kept separate. */
        if (i < 84 * 8) {
            n = in23;
            o = in45;
            p = in67;
            buf210 = in8_;
        } else {
            READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
        }

        STORE4(m, n, o, p, bitbuf210);
        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    /* Second half of the deferred stores, bubbled out of the loop */
    _mm_store_si128(bitbuf144, buf144);
    _mm_store_si128(bitbuf182, buf182);
    _mm_store_si128(bitbuf210, buf210);

    /* We also have to zero out the tail */
    size_t left_to_z = len - (300*8 + i);
    __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
    while (left_to_z >= 64) {
        STORE4(z, z, z, z, bitbuf_tail);
        left_to_z -= 64;
    }

    while (left_to_z >= 16) {
        _mm_store_si128(bitbuf_tail++, z);
        left_to_z -= 16;
    }

    uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
    while (left_to_z--) {
        *tail_bytes++ = 0;
    }

    ALIGNED_(16) uint64_t final[9] = {0};
    __m128i next12, next34, next56;
    next12 = z;
    next34 = z;
    next56 = z;

    for(; (i + 72 < len); i += 32) {
        __m128i in1in2, in3in4;
        __m128i in1in2_, in3in4_;
        __m128i ab1, ab2, ab3, ab4;
        __m128i cd1, cd2, cd3, cd4;

        READ_NEXT(input, i, in1in2, in3in4);
        READ_NEXT(bitbuffer, i, in1in2_, in3in4_);

        in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
        in3in4 = _mm_xor_si128(in3in4, in3in4_);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        __m128i a2_ = _mm_slli_si128(ab2, 8);
        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
        __m128i a4_ = _mm_slli_si128(ab4, 8);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_srli_si128(cd2, 8);
        __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
        next12 = _mm_xor_si128(next12, next56);
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_srli_si128(cd4, 8);
    }
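
    /* At most 72 bytes remain; they are copied into `final`, the pending
     * next12/next34/next56 terms are folded in, and the result is finished with
     * the byte-wise table, also folding in the matching bitbuffer bytes. */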

    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
    __m128i *final128 = (__m128i*)final;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));

    uint8_t* final_bytes = (uint8_t*) final;

    for(size_t j = 0; j < (len-i); j++) {
        crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
    }
    return crc;
}
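
/* Entry point: handle the unaligned prefix with the braid code, then choose a
 * Chorba variant by length. The CHORBA_* thresholds are defined elsewhere in
 * zlib-ng; their values are not repeated here. */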
Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
    uint64_t* aligned_buf;
    uint32_t c = (~crc) & 0xffffffff;
    uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;

    if (len > algn_diff + CHORBA_SMALL_THRESHOLD_64BIT) {
        if (algn_diff) {
            c = crc32_braid_internal(c, buf, algn_diff);
            len -= algn_diff;
        }
        aligned_buf = (uint64_t*) (buf + algn_diff);
#if !defined(WITHOUT_CHORBA)
        if(len > CHORBA_LARGE_THRESHOLD) {
            c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, len);
        } else
#endif
        if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
            c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, len);
        } else {
            c = chorba_small_nondestructive_sse2(c, aligned_buf, len);
        }
    } else {
        // Lengths too short for Chorba are processed with crc32_braid
        c = crc32_braid_internal(c, buf, len);
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
#endif