/src/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
Line | Count | Source |
1 | | /* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding |
2 | | * approach with the PCLMULQDQ and VPCLMULQDQ instructions. |
3 | | * |
4 | | * A white paper describing this algorithm can be found at: |
5 | | * doc/crc-pclmulqdq.pdf |
6 | | * |
7 | | * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support) |
8 | | * Copyright (C) 2013 Intel Corporation. All rights reserved. |
9 | | * Copyright (C) 2016 Marian Beermann (support for initial value) |
10 | | * Authors: |
11 | | * Wajdi Feghali <wajdi.k.feghali@intel.com> |
12 | | * Jim Guilford <james.guilford@intel.com> |
13 | | * Vinodh Gopal <vinodh.gopal@intel.com> |
14 | | * Erdinc Ozturk <erdinc.ozturk@intel.com> |
15 | | * Jim Kukunas <james.t.kukunas@linux.intel.com> |
16 | | * |
17 | | * For conditions of distribution and use, see copyright notice in zlib.h |
18 | | */ |
19 | | |
20 | | #include "zbuild.h" |
21 | | |
22 | | #include <immintrin.h> |
23 | | #include <wmmintrin.h> |
24 | | #include <smmintrin.h> // _mm_extract_epi32 |
25 | | |
26 | | #include "crc32_braid_p.h" |
27 | | #include "crc32_braid_tbl.h" |
28 | | #include "crc32_p.h" |
29 | | #include "x86_intrins.h" |
30 | | |
31 | | /* 512-bit VPCLMULQDQ path requires AVX-512F. The z512_* wrappers below select an intrinsic spelling per toolchain/ISA level; the imm8 value 0x96 handed to the ternarylogic intrinsics is the truth table of a three-input XOR (a ^ b ^ c). */ |
32 | | #if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) |
33 | | # if defined(_MSC_VER) && _MSC_VER < 1920 |
34 | | /* Use epi32 variants for older MSVC toolchains (v141/v140, i.e. _MSC_VER < 1920) to avoid cast warnings. The operations used here are a bitwise XOR and whole-128-bit insert/extract, so the 32-bit-element spellings produce the same bits as the 64-bit ones. */ |
35 | | # define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96) |
36 | | # define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) |
37 | | # define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) |
38 | | # else |
39 | 0 | # define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96) |
40 | | # if defined(__AVX512DQ__) |
41 | | # if defined(_MSC_VER) && !defined(_MM_K0_REG8) |
42 | | # define z512_inserti64x2(a, b, imm) _mm512_maskz_inserti64x2(UINT8_MAX, a, b, imm) /* all-ones mask keeps every element; NOTE(review): presumably a workaround for an MSVC header lacking the unmasked form - confirm */ |
43 | | # else |
44 | 0 | # define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm) |
45 | | # endif |
46 | 0 | # define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm) |
47 | | # else |
48 | | # define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) |
49 | | # define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) |
50 | | # endif |
51 | | # endif |
52 | | # ifdef __AVX512VL__ |
53 | 0 | # define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96) |
54 | | # endif |
55 | | #endif |
56 | | /* 256-bit VPCLMULQDQ macros (doesn't require AVX-512): the three-way XOR is built from two plain 256-bit XORs. */ |
57 | | #if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) |
58 | 0 | # define z256_xor3_epi64(a, b, c) _mm256_xor_si256(_mm256_xor_si256(a, b), c) |
59 | | #endif |
60 | | |
61 | | #ifndef z128_xor3_epi64 /* not selected above: fall back to two plain 128-bit XORs */ |
62 | 37.0M | # define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c) |
63 | | #endif |
64 | | |
/* Advance the four-lane fold state by one 16-byte lane.
 *
 * The oldest lane (*xmm_crc0) is carry-less multiplied against both 64-bit
 * halves of xmm_fold4 (imm 0x01: high qword of the lane times low qword of the
 * constant; imm 0x10: low qword of the lane times high qword of the constant)
 * and the two products are XORed. The other three lanes each move down one
 * slot and the folded value becomes the newest lane (*xmm_crc3).
 */
static inline void fold_state_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
                                const __m128i xmm_fold4) {
    /* Snapshot the oldest lane before the rotation below overwrites it. */
    const __m128i oldest = *xmm_crc0;
    const __m128i folded = _mm_xor_si128(_mm_clmulepi64_si128(oldest, xmm_fold4, 0x01),
                                         _mm_clmulepi64_si128(oldest, xmm_fold4, 0x10));

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = *xmm_crc3;
    *xmm_crc3 = folded;
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_1 Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_1 |
75 | | |
/* Advance the four-lane fold state by two 16-byte lanes.
 *
 * The two oldest lanes (*xmm_crc0, *xmm_crc1) are each carry-less multiplied
 * against both halves of xmm_fold4 and the paired products XORed. The two
 * newer lanes move down two slots; the folded values fill *xmm_crc2 and
 * *xmm_crc3 in the same relative order.
 */
static inline void fold_state_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
                                const __m128i xmm_fold4) {
    /* Read both lanes that are about to be folded before any slot is rewritten. */
    const __m128i lane_a = *xmm_crc0;
    const __m128i lane_b = *xmm_crc1;

    const __m128i folded_a = _mm_xor_si128(_mm_clmulepi64_si128(lane_a, xmm_fold4, 0x01),
                                           _mm_clmulepi64_si128(lane_a, xmm_fold4, 0x10));
    const __m128i folded_b = _mm_xor_si128(_mm_clmulepi64_si128(lane_b, xmm_fold4, 0x01),
                                           _mm_clmulepi64_si128(lane_b, xmm_fold4, 0x10));

    *xmm_crc0 = *xmm_crc2;
    *xmm_crc1 = *xmm_crc3;
    *xmm_crc2 = folded_a;
    *xmm_crc3 = folded_b;
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_2 Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_2 |
88 | | |
/* Advance the four-lane fold state by three 16-byte lanes.
 *
 * The three oldest lanes (*xmm_crc0..*xmm_crc2) are each carry-less multiplied
 * against both halves of xmm_fold4 and the paired products XORed. The newest
 * lane (*xmm_crc3) moves to slot 0 and the folded values occupy slots 1..3 in
 * their original relative order.
 */
static inline void fold_state_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
                                const __m128i xmm_fold4) {
    /* Read every lane that will be folded before any slot is rewritten. */
    const __m128i lane_a = *xmm_crc0;
    const __m128i lane_b = *xmm_crc1;
    const __m128i lane_c = *xmm_crc2;

    const __m128i folded_a = _mm_xor_si128(_mm_clmulepi64_si128(lane_a, xmm_fold4, 0x01),
                                           _mm_clmulepi64_si128(lane_a, xmm_fold4, 0x10));
    const __m128i folded_b = _mm_xor_si128(_mm_clmulepi64_si128(lane_b, xmm_fold4, 0x01),
                                           _mm_clmulepi64_si128(lane_b, xmm_fold4, 0x10));
    const __m128i folded_c = _mm_xor_si128(_mm_clmulepi64_si128(lane_c, xmm_fold4, 0x01),
                                           _mm_clmulepi64_si128(lane_c, xmm_fold4, 0x10));

    *xmm_crc0 = *xmm_crc3;
    *xmm_crc1 = folded_a;
    *xmm_crc2 = folded_b;
    *xmm_crc3 = folded_c;
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_3 Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_3 |
103 | | |
/* Fold all four 16-byte lanes of the state in place.
 *
 * Each lane is carry-less multiplied against both 64-bit halves of xmm_fold4
 * (imm 0x01 and 0x10) and replaced by the XOR of its two products. No lane
 * changes slot; new input data is not mixed in here.
 */
static inline void fold_state_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3,
                                const __m128i xmm_fold4) {
    /* Load the whole state first so every product is formed from the old values. */
    const __m128i lane_a = *xmm_crc0;
    const __m128i lane_b = *xmm_crc1;
    const __m128i lane_c = *xmm_crc2;
    const __m128i lane_d = *xmm_crc3;

    const __m128i a_lo = _mm_clmulepi64_si128(lane_a, xmm_fold4, 0x01);
    const __m128i a_hi = _mm_clmulepi64_si128(lane_a, xmm_fold4, 0x10);
    const __m128i b_lo = _mm_clmulepi64_si128(lane_b, xmm_fold4, 0x01);
    const __m128i b_hi = _mm_clmulepi64_si128(lane_b, xmm_fold4, 0x10);
    const __m128i c_lo = _mm_clmulepi64_si128(lane_c, xmm_fold4, 0x01);
    const __m128i c_hi = _mm_clmulepi64_si128(lane_c, xmm_fold4, 0x10);
    const __m128i d_lo = _mm_clmulepi64_si128(lane_d, xmm_fold4, 0x01);
    const __m128i d_hi = _mm_clmulepi64_si128(lane_d, xmm_fold4, 0x10);

    *xmm_crc0 = _mm_xor_si128(a_lo, a_hi);
    *xmm_crc1 = _mm_xor_si128(b_lo, b_hi);
    *xmm_crc2 = _mm_xor_si128(c_lo, c_hi);
    *xmm_crc3 = _mm_xor_si128(d_lo, d_hi);
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_4 Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_4 |
120 | | |
/* Fold all four 16-byte lanes of the state in place using a built-in constant pair.
 *
 * Same shape as a four-lane fold with a caller-supplied constant, except that
 * the multiplier is fixed here (high qword 0x596C8D81, low qword 0xF5E48C85).
 * NOTE(review): going by the name, the constants presumably correspond to a
 * fold distance of 12 lanes (192 bytes) - confirm against the constant derivation.
 */
static inline void fold_state_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    const __m128i fold_consts = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);

    /* Load the whole state first so every product is formed from the old values. */
    const __m128i lane_a = *xmm_crc0;
    const __m128i lane_b = *xmm_crc1;
    const __m128i lane_c = *xmm_crc2;
    const __m128i lane_d = *xmm_crc3;

    const __m128i a_lo = _mm_clmulepi64_si128(lane_a, fold_consts, 0x01);
    const __m128i a_hi = _mm_clmulepi64_si128(lane_a, fold_consts, 0x10);
    const __m128i b_lo = _mm_clmulepi64_si128(lane_b, fold_consts, 0x01);
    const __m128i b_hi = _mm_clmulepi64_si128(lane_b, fold_consts, 0x10);
    const __m128i c_lo = _mm_clmulepi64_si128(lane_c, fold_consts, 0x01);
    const __m128i c_hi = _mm_clmulepi64_si128(lane_c, fold_consts, 0x10);
    const __m128i d_lo = _mm_clmulepi64_si128(lane_d, fold_consts, 0x01);
    const __m128i d_hi = _mm_clmulepi64_si128(lane_d, fold_consts, 0x10);

    *xmm_crc0 = _mm_xor_si128(a_lo, a_hi);
    *xmm_crc1 = _mm_xor_si128(b_lo, b_hi);
    *xmm_crc2 = _mm_xor_si128(c_lo, c_hi);
    *xmm_crc3 = _mm_xor_si128(d_lo, d_hi);
}
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:fold_state_12 Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:fold_state_12 |
137 | | |
138 | | /* 512-bit fold function requires AVX-512F */ |
139 | | #if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) |
140 | | static inline void fold_state_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3, |
141 | | const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, |
142 | 0 | const __m512i zmm_t3, const __m512i zmm_fold16) { |
143 | 0 | __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01); |
144 | 0 | __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10); |
145 | 0 | __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01); |
146 | 0 | __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10); |
147 | 0 | __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01); |
148 | 0 | __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10); |
149 | 0 | __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01); |
150 | 0 | __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10); |
151 | |
|
152 | 0 | *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0); |
153 | 0 | *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1); |
154 | 0 | *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2); |
155 | 0 | *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3); |
156 | 0 | } |
157 | | |
158 | | static inline void fold_block_16(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0, |
159 | 0 | __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const int COPY) { |
160 | 0 | *len -= 256; |
161 | |
|
162 | 0 | __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; |
163 | 0 | __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; |
164 | 0 | __m512i z_low0, z_high0; |
165 | 0 | const __m512i zmm_fold4 = _mm512_set4_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
166 | 0 | const __m512i zmm_fold16 = _mm512_set4_epi32(0x00000001, 0x1542778a, 0x00000001, 0x322d1430); |
167 | |
|
168 | 0 | zmm_crc0 = _mm512_loadu_si512((__m512i *)*src); |
169 | 0 | zmm_crc1 = _mm512_loadu_si512((__m512i *)*src + 1); |
170 | 0 | zmm_crc2 = _mm512_loadu_si512((__m512i *)*src + 2); |
171 | 0 | zmm_crc3 = _mm512_loadu_si512((__m512i *)*src + 3); |
172 | 0 | *src += 256; |
173 | 0 | if (COPY) { |
174 | 0 | _mm512_storeu_si512((__m512i *)*dst, zmm_crc0); |
175 | 0 | _mm512_storeu_si512((__m512i *)*dst + 1, zmm_crc1); |
176 | 0 | _mm512_storeu_si512((__m512i *)*dst + 2, zmm_crc2); |
177 | 0 | _mm512_storeu_si512((__m512i *)*dst + 3, zmm_crc3); |
178 | 0 | *dst += 256; |
179 | 0 | } |
180 | | |
181 | | // Fold existing xmm state into first 64 bytes |
182 | 0 | zmm_t0 = _mm512_castsi128_si512(*xmm_crc0); |
183 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc1, 1); |
184 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc2, 2); |
185 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, *xmm_crc3, 3); |
186 | |
|
187 | 0 | z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); |
188 | 0 | z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); |
189 | 0 | zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0); |
190 | |
|
191 | 0 | while (*len >= 256) { |
192 | 0 | *len -= 256; |
193 | 0 | zmm_t0 = _mm512_loadu_si512((__m512i *)*src); |
194 | 0 | zmm_t1 = _mm512_loadu_si512((__m512i *)*src + 1); |
195 | 0 | zmm_t2 = _mm512_loadu_si512((__m512i *)*src + 2); |
196 | 0 | zmm_t3 = _mm512_loadu_si512((__m512i *)*src + 3); |
197 | 0 | *src += 256; |
198 | |
|
199 | 0 | fold_state_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16); |
200 | 0 | if (COPY) { |
201 | 0 | _mm512_storeu_si512((__m512i *)*dst, zmm_t0); |
202 | 0 | _mm512_storeu_si512((__m512i *)*dst + 1, zmm_t1); |
203 | 0 | _mm512_storeu_si512((__m512i *)*dst + 2, zmm_t2); |
204 | 0 | _mm512_storeu_si512((__m512i *)*dst + 3, zmm_t3); |
205 | 0 | *dst += 256; |
206 | 0 | } |
207 | 0 | } |
208 | | |
209 | | // zmm_crc[0,1,2,3] -> zmm_crc0 |
210 | 0 | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
211 | 0 | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
212 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1); |
213 | |
|
214 | 0 | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
215 | 0 | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
216 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2); |
217 | |
|
218 | 0 | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
219 | 0 | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
220 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3); |
221 | | |
222 | | // zmm_crc0 -> xmm_crc[0, 1, 2, 3] |
223 | 0 | *xmm_crc0 = z512_extracti64x2(zmm_crc0, 0); |
224 | 0 | *xmm_crc1 = z512_extracti64x2(zmm_crc0, 1); |
225 | 0 | *xmm_crc2 = z512_extracti64x2(zmm_crc0, 2); |
226 | 0 | *xmm_crc3 = z512_extracti64x2(zmm_crc0, 3); |
227 | 0 | } |
228 | | #endif |
229 | | /* 256-bit fold function for VPCLMULQDQ without AVX-512 */ |
230 | | #if defined(X86_VPCLMULQDQ) && !defined(__AVX512F__) |
231 | | static inline void fold_state_8(__m256i *ymm_crc0, __m256i *ymm_crc1, __m256i *ymm_crc2, __m256i *ymm_crc3, |
232 | | const __m256i ymm_t0, const __m256i ymm_t1, const __m256i ymm_t2, |
233 | 0 | const __m256i ymm_t3, const __m256i ymm_fold8) { |
234 | 0 | __m256i y_low0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x01); |
235 | 0 | __m256i y_high0 = _mm256_clmulepi64_epi128(*ymm_crc0, ymm_fold8, 0x10); |
236 | 0 | __m256i y_low1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x01); |
237 | 0 | __m256i y_high1 = _mm256_clmulepi64_epi128(*ymm_crc1, ymm_fold8, 0x10); |
238 | 0 | __m256i y_low2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x01); |
239 | 0 | __m256i y_high2 = _mm256_clmulepi64_epi128(*ymm_crc2, ymm_fold8, 0x10); |
240 | 0 | __m256i y_low3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x01); |
241 | 0 | __m256i y_high3 = _mm256_clmulepi64_epi128(*ymm_crc3, ymm_fold8, 0x10); |
242 | |
|
243 | 0 | *ymm_crc0 = z256_xor3_epi64(y_low0, y_high0, ymm_t0); |
244 | 0 | *ymm_crc1 = z256_xor3_epi64(y_low1, y_high1, ymm_t1); |
245 | 0 | *ymm_crc2 = z256_xor3_epi64(y_low2, y_high2, ymm_t2); |
246 | 0 | *ymm_crc3 = z256_xor3_epi64(y_low3, y_high3, ymm_t3); |
247 | 0 | } |
248 | | |
249 | | static inline void fold_block_8(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0, |
250 | | __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, |
251 | 0 | const __m128i xmm_fold4, const int COPY) { |
252 | 0 | *len -= 128; |
253 | |
|
254 | 0 | __m256i ymm_crc0, ymm_crc1, ymm_crc2, ymm_crc3; |
255 | 0 | __m256i ymm_t0, ymm_t1, ymm_t2, ymm_t3; |
256 | 0 | __m256i y_low0, y_high0; |
257 | 0 | const __m256i ymm_fold4 = _mm256_set_epi32( |
258 | 0 | 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596, |
259 | 0 | 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
260 | 0 | const __m256i ymm_fold8 = _mm256_set_epi32( |
261 | 0 | 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880, |
262 | 0 | 0x00000001, 0xe88ef372, 0x00000001, 0x4a7fe880); |
263 | |
|
264 | 0 | ymm_crc0 = _mm256_loadu_si256((__m256i *)*src); |
265 | 0 | ymm_crc1 = _mm256_loadu_si256((__m256i *)*src + 1); |
266 | 0 | ymm_crc2 = _mm256_loadu_si256((__m256i *)*src + 2); |
267 | 0 | ymm_crc3 = _mm256_loadu_si256((__m256i *)*src + 3); |
268 | 0 | *src += 128; |
269 | 0 | if (COPY) { |
270 | 0 | _mm256_storeu_si256((__m256i *)*dst, ymm_crc0); |
271 | 0 | _mm256_storeu_si256((__m256i *)*dst + 1, ymm_crc1); |
272 | 0 | _mm256_storeu_si256((__m256i *)*dst + 2, ymm_crc2); |
273 | 0 | _mm256_storeu_si256((__m256i *)*dst + 3, ymm_crc3); |
274 | 0 | *dst += 128; |
275 | 0 | } |
276 | | |
277 | | // Fold existing xmm state into first 32 bytes |
278 | 0 | ymm_t0 = _mm256_castsi128_si256(*xmm_crc0); |
279 | 0 | ymm_t0 = _mm256_inserti128_si256(ymm_t0, *xmm_crc1, 1); |
280 | |
|
281 | 0 | y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); |
282 | 0 | y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); |
283 | 0 | ymm_crc0 = z256_xor3_epi64(ymm_crc0, y_low0, y_high0); |
284 | |
|
285 | 0 | ymm_t0 = _mm256_castsi128_si256(*xmm_crc2); |
286 | 0 | ymm_t0 = _mm256_inserti128_si256(ymm_t0, *xmm_crc3, 1); |
287 | |
|
288 | 0 | y_low0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x01); |
289 | 0 | y_high0 = _mm256_clmulepi64_epi128(ymm_t0, ymm_fold4, 0x10); |
290 | 0 | ymm_crc1 = z256_xor3_epi64(ymm_crc1, y_low0, y_high0); |
291 | |
|
292 | 0 | while (*len >= 128) { |
293 | 0 | *len -= 128; |
294 | 0 | ymm_t0 = _mm256_loadu_si256((__m256i *)*src); |
295 | 0 | ymm_t1 = _mm256_loadu_si256((__m256i *)*src + 1); |
296 | 0 | ymm_t2 = _mm256_loadu_si256((__m256i *)*src + 2); |
297 | 0 | ymm_t3 = _mm256_loadu_si256((__m256i *)*src + 3); |
298 | 0 | *src += 128; |
299 | |
|
300 | 0 | fold_state_8(&ymm_crc0, &ymm_crc1, &ymm_crc2, &ymm_crc3, ymm_t0, ymm_t1, ymm_t2, ymm_t3, ymm_fold8); |
301 | 0 | if (COPY) { |
302 | 0 | _mm256_storeu_si256((__m256i *)*dst, ymm_t0); |
303 | 0 | _mm256_storeu_si256((__m256i *)*dst + 1, ymm_t1); |
304 | 0 | _mm256_storeu_si256((__m256i *)*dst + 2, ymm_t2); |
305 | 0 | _mm256_storeu_si256((__m256i *)*dst + 3, ymm_t3); |
306 | 0 | *dst += 128; |
307 | 0 | } |
308 | 0 | } |
309 | | |
310 | | // Extract 8 x 128-bit lanes from 4 x 256-bit registers |
311 | 0 | __m128i xmm_a0 = _mm256_castsi256_si128(ymm_crc0); |
312 | 0 | __m128i xmm_a1 = _mm256_extracti128_si256(ymm_crc0, 1); |
313 | 0 | __m128i xmm_a2 = _mm256_castsi256_si128(ymm_crc1); |
314 | 0 | __m128i xmm_a3 = _mm256_extracti128_si256(ymm_crc1, 1); |
315 | 0 | __m128i xmm_a4 = _mm256_castsi256_si128(ymm_crc2); |
316 | 0 | __m128i xmm_a5 = _mm256_extracti128_si256(ymm_crc2, 1); |
317 | 0 | __m128i xmm_a6 = _mm256_castsi256_si128(ymm_crc3); |
318 | 0 | __m128i xmm_a7 = _mm256_extracti128_si256(ymm_crc3, 1); |
319 | | |
320 | | // Fold 8 -> 4 using xmm_fold4 (fold by 64 bytes = gap between lane N and lane N+4) |
321 | 0 | __m128i x_low, x_high; |
322 | 0 | x_low = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x01); |
323 | 0 | x_high = _mm_clmulepi64_si128(xmm_a0, xmm_fold4, 0x10); |
324 | 0 | *xmm_crc0 = z128_xor3_epi64(x_low, x_high, xmm_a4); |
325 | |
|
326 | 0 | x_low = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x01); |
327 | 0 | x_high = _mm_clmulepi64_si128(xmm_a1, xmm_fold4, 0x10); |
328 | 0 | *xmm_crc1 = z128_xor3_epi64(x_low, x_high, xmm_a5); |
329 | |
|
330 | 0 | x_low = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x01); |
331 | 0 | x_high = _mm_clmulepi64_si128(xmm_a2, xmm_fold4, 0x10); |
332 | 0 | *xmm_crc2 = z128_xor3_epi64(x_low, x_high, xmm_a6); |
333 | |
|
334 | 0 | x_low = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x01); |
335 | 0 | x_high = _mm_clmulepi64_si128(xmm_a3, xmm_fold4, 0x10); |
336 | 0 | *xmm_crc3 = z128_xor3_epi64(x_low, x_high, xmm_a7); |
337 | 0 | } |
338 | | #endif |
339 | | |
340 | | /* Chorba folding algorithm implemented from https://arxiv.org/abs/2412.16398 |
341 | | * We interleave the PCLMUL-based folds with 8x scaled generator polynomial copies; we read |
342 | | * 8x QWORDS and then XOR them into the stream at the following offsets: 6, 9, 10, 16, 20, 22, |
343 | | * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper as "generator_64_bits_unrolled_8" */ |
344 | | #if !defined(X86_VPCLMULQDQ) |
345 | | static inline void fold_block_chorba(const uint8_t **src, uint8_t **dst, size_t *len, __m128i *xmm_crc0, |
346 | | __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, |
347 | 23.6k | const __m128i xmm_fold4, const int COPY) { |
348 | 23.6k | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; |
349 | | |
350 | 23.6k | #ifndef __AVX512VL__ |
351 | 23.6k | if (!COPY) |
352 | 5.69k | return; |
353 | 17.9k | #endif |
354 | 985k | while (*len >= 512 + 64 + (16 * 8)) { |
355 | 967k | __m128i chorba8 = _mm_load_si128((__m128i *)*src); |
356 | 967k | __m128i chorba7 = _mm_load_si128((__m128i *)*src + 1); |
357 | 967k | __m128i chorba6 = _mm_load_si128((__m128i *)*src + 2); |
358 | 967k | __m128i chorba5 = _mm_load_si128((__m128i *)*src + 3); |
359 | 967k | __m128i chorba4 = _mm_load_si128((__m128i *)*src + 4); |
360 | 967k | __m128i chorba3 = _mm_load_si128((__m128i *)*src + 5); |
361 | 967k | __m128i chorba2 = _mm_load_si128((__m128i *)*src + 6); |
362 | 967k | __m128i chorba1 = _mm_load_si128((__m128i *)*src + 7); |
363 | 967k | if (COPY) { |
364 | 967k | _mm_storeu_si128((__m128i *)*dst, chorba8); |
365 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, chorba7); |
366 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, chorba6); |
367 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, chorba5); |
368 | 967k | _mm_storeu_si128((__m128i *)*dst + 4, chorba4); |
369 | 967k | _mm_storeu_si128((__m128i *)*dst + 5, chorba3); |
370 | 967k | _mm_storeu_si128((__m128i *)*dst + 6, chorba2); |
371 | 967k | _mm_storeu_si128((__m128i *)*dst + 7, chorba1); |
372 | 967k | *dst += 16 * 8; |
373 | 967k | } |
374 | | |
375 | 967k | chorba2 = _mm_xor_si128(chorba2, chorba8); |
376 | 967k | chorba1 = _mm_xor_si128(chorba1, chorba7); |
377 | 967k | *src += 16 * 8; |
378 | 967k | *len -= 16 * 8; |
379 | | |
380 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src); |
381 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 1); |
382 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 2); |
383 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 3); |
384 | | |
385 | 967k | fold_state_12(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); |
386 | 967k | if (COPY) { |
387 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
388 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
389 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
390 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
391 | 967k | *dst += 64; |
392 | 967k | } |
393 | | |
394 | 967k | *xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, *xmm_crc0); |
395 | 967k | *xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), *xmm_crc1); |
396 | 967k | *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, *xmm_crc2); |
397 | 967k | *xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, *xmm_crc3); |
398 | | |
399 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 4); |
400 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 5); |
401 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 6); |
402 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 7); |
403 | | |
404 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
405 | 967k | if (COPY) { |
406 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
407 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
408 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
409 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
410 | 967k | *dst += 64; |
411 | 967k | } |
412 | | |
413 | 967k | *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, *xmm_crc0); |
414 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, *xmm_crc1); |
415 | 967k | *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), *xmm_crc2); |
416 | 967k | *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), *xmm_crc3); |
417 | | |
418 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 8); |
419 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 9); |
420 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 10); |
421 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 11); |
422 | | |
423 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
424 | 967k | if (COPY) { |
425 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
426 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
427 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
428 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
429 | 967k | *dst += 64; |
430 | 967k | } |
431 | | |
432 | 967k | *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, *xmm_crc0); |
433 | 967k | *xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), *xmm_crc1); |
434 | 967k | *xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, *xmm_crc2); |
435 | 967k | *xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, *xmm_crc3); |
436 | | |
437 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 12); |
438 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 13); |
439 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 14); |
440 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 15); |
441 | | |
442 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
443 | 967k | if (COPY) { |
444 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
445 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
446 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
447 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
448 | 967k | *dst += 64; |
449 | 967k | } |
450 | | |
451 | 967k | *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), *xmm_crc0); |
452 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, *xmm_crc1); |
453 | 967k | *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), *xmm_crc2); |
454 | 967k | *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), *xmm_crc3); |
455 | | |
456 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 16); |
457 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 17); |
458 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 18); |
459 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 19); |
460 | | |
461 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
462 | 967k | if (COPY) { |
463 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
464 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
465 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
466 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
467 | 967k | *dst += 64; |
468 | 967k | } |
469 | | |
470 | 967k | *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), *xmm_crc0); |
471 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, *xmm_crc1); |
472 | 967k | *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, *xmm_crc2); |
473 | 967k | *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), *xmm_crc3); |
474 | | |
475 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 20); |
476 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 21); |
477 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 22); |
478 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 23); |
479 | | |
480 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
481 | 967k | if (COPY) { |
482 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
483 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
484 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
485 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
486 | 967k | *dst += 64; |
487 | 967k | } |
488 | | |
489 | 967k | *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), *xmm_crc0); |
490 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, *xmm_crc1); |
491 | 967k | *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, *xmm_crc2); |
492 | 967k | *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), *xmm_crc3); |
493 | | |
494 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 24); |
495 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 25); |
496 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 26); |
497 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 27); |
498 | | |
499 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
500 | 967k | if (COPY) { |
501 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
502 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
503 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
504 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
505 | 967k | *dst += 64; |
506 | 967k | } |
507 | | |
508 | 967k | *xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), *xmm_crc0); |
509 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, *xmm_crc1); |
510 | 967k | *xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, *xmm_crc2); |
511 | 967k | *xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), *xmm_crc3); |
512 | | |
513 | 967k | xmm_t0 = _mm_load_si128((__m128i *)*src + 28); |
514 | 967k | xmm_t1 = _mm_load_si128((__m128i *)*src + 29); |
515 | 967k | xmm_t2 = _mm_load_si128((__m128i *)*src + 30); |
516 | 967k | xmm_t3 = _mm_load_si128((__m128i *)*src + 31); |
517 | | |
518 | 967k | fold_state_4(xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_fold4); |
519 | 967k | if (COPY) { |
520 | 967k | _mm_storeu_si128((__m128i *)*dst, xmm_t0); |
521 | 967k | _mm_storeu_si128((__m128i *)*dst + 1, xmm_t1); |
522 | 967k | _mm_storeu_si128((__m128i *)*dst + 2, xmm_t2); |
523 | 967k | _mm_storeu_si128((__m128i *)*dst + 3, xmm_t3); |
524 | 967k | *dst += 64; |
525 | 967k | } |
526 | | |
527 | 967k | *xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, *xmm_crc0); |
528 | 967k | *xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, *xmm_crc1); |
529 | 967k | *xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), *xmm_crc2); |
530 | 967k | *xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, *xmm_crc3); |
531 | | |
532 | 967k | *len -= 512; |
533 | 967k | *src += 512; |
534 | 967k | } |
535 | 17.9k | } |
536 | | #endif |
537 | | /* crc32_copy_impl -- continue a CRC-32 over len bytes at src and, when COPY is |
    | | * non-zero, also store those bytes to dst. |
    | | * |
    | | *   crc  - CRC value to continue from (the crc != 0 branch below folds it in) |
    | | *   dst  - output buffer; only written when COPY is non-zero |
    | | *   src  - input bytes |
    | | *   len  - number of input bytes |
    | | *   COPY - non-zero to copy the input to dst while it is being checksummed |
    | | * |
    | | * Inputs shorter than 16 bytes, inputs that would have fewer than 16 bytes left |
    | | * after alignment, and the unaligned prefix of longer inputs are handed to |
    | | * crc32_copy_small(). The rest is folded into four 128-bit accumulators |
    | | * (xmm_crc0..3), which are then merged and reduced to the 32-bit return value. */ |
538 | | Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, |
539 | 7.06M | const int COPY) { |
540 | 7.06M | size_t copy_len = len; |
541 | 7.06M | if (len >= 16) { |
542 | | /* Calculate 16-byte alignment offset */ |
543 | 1.37M | uintptr_t align_diff = ALIGN_DIFF(src, 16); |
544 | | |
545 | | /* If total length is less than (alignment bytes + 16), use the faster small method. |
546 | | * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ |
547 | 1.37M | copy_len = len < align_diff + 16 ? len : align_diff; |
548 | 1.37M | } |
549 | | /* copy_len is the byte count given to crc32_copy_small(): all of len when |
    | | * len < 16 (its initial value), otherwise the value chosen above. The helper is |
    | | * called with the complemented crc and its result is complemented back. |
    | | * NOTE(review): the meaning of the literal 31 is defined by crc32_copy_small(), |
    | | * which is not visible here -- confirm before changing it. */ |
550 | 7.06M | if (copy_len > 0) { |
551 | 6.60M | crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); |
552 | 6.60M | src += copy_len; |
553 | 6.60M | len -= copy_len; |
554 | 6.60M | if (COPY) { |
555 | 4.17k | dst += copy_len; |
556 | 4.17k | } |
557 | 6.60M | } |
558 | | /* Nothing left for the folding path: the small method consumed the whole input */ |
559 | 7.06M | if (len == 0) |
560 | 5.72M | return crc; |
561 | | /* len >= 16 from here on: the small method either consumed everything (returned |
    | | * above) or only align_diff bytes of an input with len >= align_diff + 16. |
    | | * The aligned loads (_mm_load_si128) below need src to be 16-byte aligned. |
    | | * NOTE(review): assumes ALIGN_DIFF(src, 16) is the distance to the next 16-byte |
    | | * boundary -- confirm against its definition. */ |
562 | 1.34M | const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
563 | | /* NOTE(review): 0x9db42487 below is presumably the pre-conditioned start state for |
    | | * a zero incoming crc -- confirm against doc/crc-pclmulqdq.pdf */ |
564 | 1.34M | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; |
565 | 1.34M | __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); |
566 | 1.34M | __m128i xmm_crc1 = _mm_setzero_si128(); |
567 | 1.34M | __m128i xmm_crc2 = _mm_setzero_si128(); |
568 | 1.34M | __m128i xmm_crc3 = _mm_setzero_si128(); |
569 | | /* A non-zero incoming crc is XORed into xmm_crc3 together with the first 16-byte |
    | | * block, after one fold_state_1() step */ |
570 | 1.34M | if (crc != 0) { |
571 | | // Process the first 16 bytes and handle initial CRC |
572 | 1.11M | len -= 16; |
573 | 1.11M | xmm_t0 = _mm_load_si128((__m128i *)src); |
574 | 1.11M | src += 16; |
575 | | |
576 | 1.11M | fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
577 | 1.11M | if (COPY) { |
578 | 15.2k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
579 | 15.2k | dst += 16; |
580 | 15.2k | } |
581 | 1.11M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); |
582 | 1.11M | } |
583 | | |
584 | | #if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) |
585 | | /* 512-bit VPCLMULQDQ path requires AVX-512F */ |
586 | 0 | if (len >= 256) |
587 | 0 | fold_block_16(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, COPY); |
588 | | #elif defined(X86_VPCLMULQDQ) |
589 | | /* 256-bit VPCLMULQDQ path (doesn't require AVX-512F) */ |
590 | 0 | if (len >= 128) |
591 | 0 | fold_block_8(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY); |
592 | | #else |
593 | | /* Chorba algorithm for PCLMULQDQ path (when VPCLMULQDQ not available) */ |
594 | 1.34M | if (len >= 512 + 64 + (16 * 8)) |
595 | 23.6k | fold_block_chorba(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY); |
596 | | #endif /* X86_VPCLMULQDQ */ |
597 | | /* Fold 64 bytes (four 128-bit blocks) per iteration for whatever remains in len |
    | | * after the optional block routine above */ |
598 | 9.45M | while (len >= 64) { |
599 | 8.11M | len -= 64; |
600 | 8.11M | xmm_t0 = _mm_load_si128((__m128i *)src); |
601 | 8.11M | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
602 | 8.11M | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
603 | 8.11M | xmm_t3 = _mm_load_si128((__m128i *)src + 3); |
604 | 8.11M | src += 64; |
605 | | |
606 | 8.11M | fold_state_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
607 | 8.11M | if (COPY) { |
608 | 49.8k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
609 | 49.8k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
610 | 49.8k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
611 | 49.8k | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
612 | 49.8k | dst += 64; |
613 | 49.8k | } |
614 | | |
615 | 8.11M | xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); |
616 | 8.11M | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); |
617 | 8.11M | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); |
618 | 8.11M | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); |
619 | 8.11M | } |
620 | | |
621 | | /* |
622 | | * len is now the number of bytes left, which is always less than 64. At most one |
623 | | * of the 48/32/16-byte branches below runs, leaving fewer than 16 bytes. */ |
624 | 1.34M | if (len >= 48) { |
625 | 96.1k | len -= 48; |
626 | | |
627 | 96.1k | xmm_t0 = _mm_load_si128((__m128i *)src); |
628 | 96.1k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
629 | 96.1k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
630 | 96.1k | src += 48; |
631 | | |
632 | 96.1k | fold_state_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
633 | 96.1k | if (COPY) { |
634 | 13.2k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
635 | 13.2k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
636 | 13.2k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
637 | 13.2k | dst += 48; |
638 | 13.2k | } |
639 | | |
640 | 96.1k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); |
641 | 96.1k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); |
642 | 96.1k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); |
643 | 1.24M | } else if (len >= 32) { |
644 | 194k | len -= 32; |
645 | | |
646 | 194k | xmm_t0 = _mm_load_si128((__m128i *)src); |
647 | 194k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
648 | 194k | src += 32; |
649 | | |
650 | 194k | fold_state_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
651 | 194k | if (COPY) { |
652 | 4.49k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
653 | 4.49k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
654 | 4.49k | dst += 32; |
655 | 4.49k | } |
656 | | |
657 | 194k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); |
658 | 194k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); |
659 | 1.05M | } else if (len >= 16) { |
660 | 231k | len -= 16; |
661 | 231k | xmm_t0 = _mm_load_si128((__m128i *)src); |
662 | 231k | src += 16; |
663 | | |
664 | 231k | fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
665 | 231k | if (COPY) { |
666 | 3.09k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
667 | 3.09k | dst += 16; |
668 | 3.09k | } |
669 | | |
670 | 231k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); |
671 | 231k | } |
672 | | |
673 | 1.34M | const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); |
674 | 1.34M | const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); |
675 | | |
676 | | /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */ |
677 | 1.34M | __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); |
678 | 1.34M | __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); |
679 | 1.34M | xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); |
680 | | |
681 | 1.34M | __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); |
682 | 1.34M | __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); |
683 | 1.34M | xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); |
684 | | |
685 | 1.34M | __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); |
686 | 1.34M | __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); |
687 | 1.34M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); |
688 | | |
689 | | /* Fold the remaining len (< 16) bytes into the 128-bit state */ |
690 | 1.34M | if (len) { |
691 | 904k | const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); |
692 | 904k | const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
693 | | |
694 | | /* Create masks to shift bytes for partial input */ |
695 | 904k | __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); |
696 | 904k | __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); |
697 | | |
698 | | /* Shift out bytes from crc3 to make space for new data */ |
699 | 904k | __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); |
700 | 904k | xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); |
701 | | |
702 | | /* Insert the partial input into crc3 */ |
703 | | #if defined(__AVX512BW__) && defined(__AVX512VL__) |
704 | | __mmask16 k = (1 << len) - 1; |
705 | | __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); |
706 | 0 | if (COPY) { |
707 | 0 | _mm_mask_storeu_epi8(dst, k, xmm_crc_part); |
708 | 0 | } |
709 | | #else |
710 | | __m128i xmm_crc_part = _mm_setzero_si128(); |
711 | | memcpy(&xmm_crc_part, src, len); |
712 | 904k | if (COPY) { |
713 | 11.6k | memcpy(dst, src, len); |
714 | 11.6k | } |
715 | | #endif |
716 | 904k | __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); |
717 | 904k | xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); |
718 | | |
719 | | /* Fold the bytes that were shifted out back into crc3 */ |
720 | 904k | __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01); |
721 | 904k | __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10); |
722 | 904k | xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high); |
723 | 904k | } |
724 | | |
725 | | /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */ |
726 | 1.34M | __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00); |
727 | 1.34M | __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10); |
728 | | |
729 | 1.34M | x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf); |
730 | 1.34M | x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3); |
731 | | |
732 | 1.34M | __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01); |
733 | 1.34M | __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10); |
734 | | /* The result is 32-bit element 2 of x_res_b; its complement is returned */ |
735 | 1.34M | crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2)); |
736 | | |
737 | 1.34M | return ~crc; |
738 | 7.06M | } crc32_pclmulqdq.c:crc32_copy_impl Line | Count | Source | 539 | 7.06M | const int COPY) { | 540 | 7.06M | size_t copy_len = len; | 541 | 7.06M | if (len >= 16) { | 542 | | /* Calculate 16-byte alignment offset */ | 543 | 1.37M | uintptr_t align_diff = ALIGN_DIFF(src, 16); | 544 | | | 545 | | /* If total length is less than (alignment bytes + 16), use the faster small method. | 546 | | * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ | 547 | 1.37M | copy_len = len < align_diff + 16 ? len : align_diff; | 548 | 1.37M | } | 549 | | | 550 | 7.06M | if (copy_len > 0) { | 551 | 6.60M | crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); | 552 | 6.60M | src += copy_len; | 553 | 6.60M | len -= copy_len; | 554 | 6.60M | if (COPY) { | 555 | 4.17k | dst += copy_len; | 556 | 4.17k | } | 557 | 6.60M | } | 558 | | | 559 | 7.06M | if (len == 0) | 560 | 5.72M | return crc; | 561 | | | 562 | 1.34M | const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); | 563 | | | 564 | 1.34M | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; | 565 | 1.34M | __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); | 566 | 1.34M | __m128i xmm_crc1 = _mm_setzero_si128(); | 567 | 1.34M | __m128i xmm_crc2 = _mm_setzero_si128(); | 568 | 1.34M | __m128i xmm_crc3 = _mm_setzero_si128(); | 569 | | | 570 | 1.34M | if (crc != 0) { | 571 | | // Process the first 16 bytes and handle initial CRC | 572 | 1.11M | len -= 16; | 573 | 1.11M | xmm_t0 = _mm_load_si128((__m128i *)src); | 574 | 1.11M | src += 16; | 575 | | | 576 | 1.11M | fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 577 | 1.11M | if (COPY) { | 578 | 15.2k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 579 | 15.2k | dst += 16; | 580 | 15.2k | } | 581 | 1.11M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); | 582 | 1.11M | } | 583 | | | 584 | | #if defined(X86_VPCLMULQDQ) && defined(__AVX512F__) | 585 | | /* 512-bit 
VPCLMULQDQ path requires AVX-512F */ | 586 | | if (len >= 256) | 587 | | fold_block_16(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, COPY); | 588 | | #elif defined(X86_VPCLMULQDQ) | 589 | | /* 256-bit VPCLMULQDQ path (doesn't require AVX-512F) */ | 590 | | if (len >= 128) | 591 | | fold_block_8(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY); | 592 | | #else | 593 | | /* Chorba algorithm for PCLMULQDQ path (when VPCLMULQDQ not available) */ | 594 | 1.34M | if (len >= 512 + 64 + (16 * 8)) | 595 | 23.6k | fold_block_chorba(&src, &dst, &len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4, COPY); | 596 | 1.34M | #endif /* X86_VPCLMULQDQ */ | 597 | | | 598 | 9.45M | while (len >= 64) { | 599 | 8.11M | len -= 64; | 600 | 8.11M | xmm_t0 = _mm_load_si128((__m128i *)src); | 601 | 8.11M | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 602 | 8.11M | xmm_t2 = _mm_load_si128((__m128i *)src + 2); | 603 | 8.11M | xmm_t3 = _mm_load_si128((__m128i *)src + 3); | 604 | 8.11M | src += 64; | 605 | | | 606 | 8.11M | fold_state_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 607 | 8.11M | if (COPY) { | 608 | 49.8k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 609 | 49.8k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 610 | 49.8k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 611 | 49.8k | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 612 | 49.8k | dst += 64; | 613 | 49.8k | } | 614 | | | 615 | 8.11M | xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); | 616 | 8.11M | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); | 617 | 8.11M | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); | 618 | 8.11M | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); | 619 | 8.11M | } | 620 | | | 621 | | /* | 622 | | * len = num bytes left - 64 | 623 | | */ | 624 | 1.34M | if (len >= 48) { | 625 | 96.1k | len -= 48; | 626 | | | 627 | 96.1k | xmm_t0 = _mm_load_si128((__m128i *)src); | 628 | 96.1k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 629 | 96.1k | 
xmm_t2 = _mm_load_si128((__m128i *)src + 2); | 630 | 96.1k | src += 48; | 631 | | | 632 | 96.1k | fold_state_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 633 | 96.1k | if (COPY) { | 634 | 13.2k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 635 | 13.2k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 636 | 13.2k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 637 | 13.2k | dst += 48; | 638 | 13.2k | } | 639 | | | 640 | 96.1k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); | 641 | 96.1k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); | 642 | 96.1k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); | 643 | 1.24M | } else if (len >= 32) { | 644 | 194k | len -= 32; | 645 | | | 646 | 194k | xmm_t0 = _mm_load_si128((__m128i *)src); | 647 | 194k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 648 | 194k | src += 32; | 649 | | | 650 | 194k | fold_state_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 651 | 194k | if (COPY) { | 652 | 4.49k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 653 | 4.49k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 654 | 4.49k | dst += 32; | 655 | 4.49k | } | 656 | | | 657 | 194k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); | 658 | 194k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); | 659 | 1.05M | } else if (len >= 16) { | 660 | 231k | len -= 16; | 661 | 231k | xmm_t0 = _mm_load_si128((__m128i *)src); | 662 | 231k | src += 16; | 663 | | | 664 | 231k | fold_state_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 665 | 231k | if (COPY) { | 666 | 3.09k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 667 | 3.09k | dst += 16; | 668 | 3.09k | } | 669 | | | 670 | 231k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); | 671 | 231k | } | 672 | | | 673 | 1.34M | const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); | 674 | 1.34M | const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); | 675 | | | 676 | | /* Fold 4x128-bit into a single 128-bit value using k1/k2 
constants */ | 677 | 1.34M | __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); | 678 | 1.34M | __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); | 679 | 1.34M | xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); | 680 | | | 681 | 1.34M | __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); | 682 | 1.34M | __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); | 683 | 1.34M | xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); | 684 | | | 685 | 1.34M | __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); | 686 | 1.34M | __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); | 687 | 1.34M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); | 688 | | | 689 | | /* Fold remaining bytes into the 128-bit state */ | 690 | 1.34M | if (len) { | 691 | 904k | const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); | 692 | 904k | const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); | 693 | | | 694 | | /* Create masks to shift bytes for partial input */ | 695 | 904k | __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); | 696 | 904k | __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); | 697 | | | 698 | | /* Shift out bytes from crc3 to make space for new data */ | 699 | 904k | __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); | 700 | 904k | xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); | 701 | | | 702 | | /* Insert the partial input into crc3 */ | 703 | | #if defined(__AVX512BW__) && defined(__AVX512VL__) | 704 | | __mmask16 k = (1 << len) - 1; | 705 | | __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); | 706 | | if (COPY) { | 707 | | _mm_mask_storeu_epi8(dst, k, xmm_crc_part); | 708 | | } | 709 | | #else | 710 | 904k | __m128i xmm_crc_part = _mm_setzero_si128(); | 711 | 904k | memcpy(&xmm_crc_part, src, len); | 712 | 904k | if (COPY) { | 713 | 11.6k | memcpy(dst, src, len); | 714 | 11.6k | } | 715 | 904k | #endif | 716 | 904k | __m128i 
part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); | 717 | 904k | xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); | 718 | | | 719 | | /* Fold the bytes that were shifted out back into crc3 */ | 720 | 904k | __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01); | 721 | 904k | __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10); | 722 | 904k | xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high); | 723 | 904k | } | 724 | | | 725 | | /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */ | 726 | 1.34M | __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00); | 727 | 1.34M | __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10); | 728 | | | 729 | 1.34M | x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf); | 730 | 1.34M | x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3); | 731 | | | 732 | 1.34M | __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01); | 733 | 1.34M | __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10); | 734 | | | 735 | 1.34M | crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2)); | 736 | | | 737 | 1.34M | return ~crc; | 738 | 7.06M | } |
Unexecuted instantiation: crc32_vpclmulqdq_avx2.c:crc32_copy_impl Unexecuted instantiation: crc32_vpclmulqdq_avx512.c:crc32_copy_impl |