/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq_tpl.h
Line | Count | Source |
1 | | /* |
2 | | * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ |
3 | | * instruction. |
4 | | * |
5 | | * A white paper describing this algorithm can be found at: |
6 | | * doc/crc-pclmulqdq.pdf |
7 | | * |
8 | | * Copyright (C) 2013 Intel Corporation. All rights reserved. |
9 | | * Copyright (C) 2016 Marian Beermann (support for initial value) |
10 | | * Authors: |
11 | | * Wajdi Feghali <wajdi.k.feghali@intel.com> |
12 | | * Jim Guilford <james.guilford@intel.com> |
13 | | * Vinodh Gopal <vinodh.gopal@intel.com> |
14 | | * Erdinc Ozturk <erdinc.ozturk@intel.com> |
15 | | * Jim Kukunas <james.t.kukunas@linux.intel.com> |
16 | | * |
17 | | * For conditions of distribution and use, see copyright notice in zlib.h |
18 | | */ |
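Editorial aside: the folding scheme referenced above reduces CRC-32 over long buffers to carry-less multiplications, advancing a 4x128-bit running state by precomputed constants (powers of x modulo the CRC polynomial) and XORing in incoming data; the fold_1/fold_2/fold_3/fold_4/fold_12 helpers used below do exactly that at different fold distances. As rough orientation only, here is a minimal hypothetical sketch of a single 128-bit fold step; fold_step_sketch and fold_const are placeholders, not this file's real helpers or constants.

    #include <emmintrin.h>   /* SSE2: _mm_xor_si128 */
    #include <wmmintrin.h>   /* PCLMULQDQ: _mm_clmulepi64_si128 */

    /* Hypothetical single fold step: multiply each 64-bit lane of the
     * accumulator by the matching fold constant (x^T mod P for the chosen
     * fold distance T), then mix in the next 16 bytes of input. */
    static inline __m128i fold_step_sketch(__m128i acc, __m128i next16, __m128i fold_const) {
        __m128i lo = _mm_clmulepi64_si128(acc, fold_const, 0x00); /* acc.lo * const.lo */
        __m128i hi = _mm_clmulepi64_si128(acc, fold_const, 0x11); /* acc.hi * const.hi */
        return _mm_xor_si128(_mm_xor_si128(lo, hi), next16);
    }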
19 | | |
20 | | #ifdef COPY |
21 | 16.8k | Z_INTERNAL void CRC32_FOLD_COPY(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { |
22 | | #else |
23 | 1.43M | Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { |
24 | 1.43M | #endif |
25 | 1.43M | unsigned long algn_diff; |
26 | 1.43M | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; |
27 | 1.43M | __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; |
28 | 1.43M | __m128i xmm_crc_part = _mm_setzero_si128(); |
29 | 1.43M | char ALIGNED_(16) partial_buf[16] = { 0 }; |
30 | | #ifndef COPY |
31 | | __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); |
32 | | int32_t first = init_crc != 0; |
33 | | |
34 | | /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed |
35 | | * for the aligning load that occurs. If there's an initial CRC, to carry it forward through |
36 | | * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be |
37 | | * up to 15 bytes + one full vector load. */ |
38 | | assert(len >= 16 || first == 0); |
39 | | #endif |
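A worked instance of the bound in the comment above: if src sits one byte past a 16-byte boundary, the aligning path consumes 16 - 1 = 15 unaligned bytes and then performs one full 16-byte vector load, giving the stated maximum of 15 + 16 = 31 bytes needed to carry an initial CRC forward.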
40 | 1.43M | crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
41 | | |
42 | 1.45M | if (len < 16) { |
43 | 1.75k | if (len == 0) |
44 | 0 | return; |
45 | | |
46 | 1.75k | memcpy(partial_buf, src, len); |
47 | 1.75k | xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); |
48 | | #ifdef COPY |
49 | | memcpy(dst, partial_buf, len); |
50 | | #endif |
51 | 1.75k | goto partial; |
52 | 1.75k | } |
53 | | |
54 | 1.45M | algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; |
55 | 1.45M | if (algn_diff) { |
56 | 1.17M | xmm_crc_part = _mm_loadu_si128((__m128i *)src); |
57 | | #ifdef COPY |
58 | | _mm_storeu_si128((__m128i *)dst, xmm_crc_part); |
59 | | dst += algn_diff; |
60 | | #else |
61 | 1.17M | XOR_INITIAL128(xmm_crc_part); |
62 | | |
63 | 1.17M | if (algn_diff < 4 && init_crc != 0) { |
64 | 104k | xmm_t0 = xmm_crc_part; |
65 | 104k | if (len >= 32) { |
66 | 39.6k | xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); |
67 | 39.6k | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
68 | 39.6k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); |
69 | 64.4k | } else { |
70 | 64.4k | memcpy(partial_buf, src + 16, len - 16); |
71 | 64.4k | xmm_crc_part = _mm_load_si128((__m128i*)partial_buf); |
72 | 64.4k | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
73 | 64.4k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); |
74 | 64.4k | src += 16; |
75 | 64.4k | len -= 16; |
76 | | #ifdef COPY |
77 | | dst -= algn_diff; |
78 | | #endif |
79 | 64.4k | goto partial; |
80 | 64.4k | } |
81 | | |
82 | 39.6k | src += 16; |
83 | 39.6k | len -= 16; |
84 | 39.6k | } |
85 | 1.10M | #endif |
86 | | |
87 | 1.10M | partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); |
88 | | |
89 | 1.10M | src += algn_diff; |
90 | 1.10M | len -= algn_diff; |
91 | 1.10M | } |
92 | | |
93 | | #ifdef X86_VPCLMULQDQ |
94 | 0 | if (len >= 256) { |
95 | | #ifdef COPY |
96 | | size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len); |
97 | | dst += n; |
98 | | #else |
99 | | size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, |
100 | | xmm_initial, first); |
101 | | first = 0; |
102 | | #endif |
103 | 0 | len -= n; |
104 | 0 | src += n; |
105 | 0 | } |
106 | | #endif |
107 | | |
108 | 0 | #ifndef WITHOUT_CHORBA |
109 | | /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 |
110 | | * We interleave the PCLMUL-based folds with 8x scaled generator |
111 | | * polynomial copies; we read 8x QWORDS and then XOR them into |
112 | | * the stream at the following offsets: 6, 9, 10, 16, 20, 22, |
113 | | * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper |
114 | | * as "generator_64_bits_unrolled_8" */ |
115 | 2.63M | while (len >= 512 + 64 + 16*8) { |
116 | 1.24M | __m128i chorba8 = _mm_loadu_si128((__m128i *)src); |
117 | 1.24M | __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1); |
118 | 1.24M | __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2); |
119 | 1.24M | __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3); |
120 | 1.24M | __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4); |
121 | 1.24M | __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5); |
122 | 1.24M | __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6); |
123 | 1.24M | __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7); |
124 | | #ifdef COPY |
125 | | _mm_storeu_si128((__m128i *)dst, chorba8); |
126 | | _mm_storeu_si128((__m128i *)dst + 1, chorba7); |
127 | | _mm_storeu_si128((__m128i *)dst + 2, chorba6); |
128 | | _mm_storeu_si128((__m128i *)dst + 3, chorba5); |
129 | | _mm_storeu_si128((__m128i *)dst + 4, chorba4); |
130 | | _mm_storeu_si128((__m128i *)dst + 5, chorba3); |
131 | | _mm_storeu_si128((__m128i *)dst + 6, chorba2); |
132 | | _mm_storeu_si128((__m128i *)dst + 7, chorba1); |
133 | | dst += 16*8; |
134 | | #else |
135 | 557k | XOR_INITIAL128(chorba8); |
136 | | #endif |
137 | 1.24M | chorba2 = _mm_xor_si128(chorba2, chorba8); |
138 | 1.24M | chorba1 = _mm_xor_si128(chorba1, chorba7); |
139 | 1.24M | src += 16*8; |
140 | 1.24M | len -= 16*8; |
141 | | |
142 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src); |
143 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 1); |
144 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 2); |
145 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 3); |
146 | | |
147 | 1.24M | fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
148 | | #ifdef COPY |
149 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
150 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
151 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
152 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
153 | | dst += 64; |
154 | | #endif |
155 | 1.24M | xmm_t0 = _mm_xor_si128(xmm_t0, chorba6); |
156 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8); |
157 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7); |
158 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6); |
159 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
160 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
161 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
162 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
163 | | |
164 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 4); |
165 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 5); |
166 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 6); |
167 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 7); |
168 | | |
169 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
170 | | #ifdef COPY |
171 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
172 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
173 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
174 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
175 | | dst += 64; |
176 | | #endif |
177 | | |
178 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5); |
179 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5); |
180 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4); |
181 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3); |
182 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
183 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
184 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
185 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
186 | | |
187 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 8); |
188 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 9); |
189 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 10); |
190 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 11); |
191 | | |
192 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
193 | | #ifdef COPY |
194 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
195 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
196 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
197 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
198 | | dst += 64; |
199 | | #endif |
200 | | |
201 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8); |
202 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7); |
203 | 1.24M | xmm_t2 = _mm_xor_si128(xmm_t2, chorba6); |
204 | 1.24M | xmm_t3 = _mm_xor_si128(xmm_t3, chorba5); |
205 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
206 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
207 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
208 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
209 | | |
210 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 12); |
211 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 13); |
212 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 14); |
213 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 15); |
214 | | |
215 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
216 | | #ifdef COPY |
217 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
218 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
219 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
220 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
221 | | dst += 64; |
222 | | #endif |
223 | | |
224 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8); |
225 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7); |
226 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6); |
227 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5); |
228 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
229 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
230 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
231 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
232 | | |
233 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 16); |
234 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 17); |
235 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 18); |
236 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 19); |
237 | | |
238 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
239 | | #ifdef COPY |
240 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
241 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
242 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
243 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
244 | | dst += 64; |
245 | | #endif |
246 | | |
247 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5); |
248 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5); |
249 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6); |
250 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5); |
251 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
252 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
253 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
254 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
255 | | |
256 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 20); |
257 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 21); |
258 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 22); |
259 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 23); |
260 | | |
261 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
262 | | #ifdef COPY |
263 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
264 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
265 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
266 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
267 | | dst += 64; |
268 | | #endif |
269 | | |
270 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); |
271 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6); |
272 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5); |
273 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); |
274 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
275 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
276 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
277 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
278 | | |
279 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 24); |
280 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 25); |
281 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 26); |
282 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 27); |
283 | | |
284 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
285 | | #ifdef COPY |
286 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
287 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
288 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
289 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
290 | | dst += 64; |
291 | | #endif |
292 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6); |
293 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5); |
294 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5); |
295 | 1.24M | xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5); |
296 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
297 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
298 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
299 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
300 | | |
301 | 1.24M | xmm_t0 = _mm_loadu_si128((__m128i *)src + 28); |
302 | 1.24M | xmm_t1 = _mm_loadu_si128((__m128i *)src + 29); |
303 | 1.24M | xmm_t2 = _mm_loadu_si128((__m128i *)src + 30); |
304 | 1.24M | xmm_t3 = _mm_loadu_si128((__m128i *)src + 31); |
305 | | |
306 | 1.24M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
307 | | #ifdef COPY |
308 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
309 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
310 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
311 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
312 | | dst += 64; |
313 | | #endif |
314 | 1.24M | xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4); |
315 | 1.24M | xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3); |
316 | 1.24M | xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2); |
317 | 1.24M | xmm_t3 = _mm_xor_si128(xmm_t3, chorba1); |
318 | 1.24M | xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); |
319 | 1.24M | xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); |
320 | 1.24M | xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); |
321 | 1.24M | xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); |
322 | | |
323 | 1.24M | len -= 512; |
324 | 1.24M | src += 512; |
325 | 1.24M | } |
326 | 1.37M | #endif /* WITHOUT_CHORBA */ |
327 | | |
328 | 1.95M | while (len >= 64) { |
329 | 561k | len -= 64; |
330 | 561k | xmm_t0 = _mm_load_si128((__m128i *)src); |
331 | 561k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
332 | 561k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
333 | 561k | xmm_t3 = _mm_load_si128((__m128i *)src + 3); |
334 | 561k | src += 64; |
335 | | |
336 | 561k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
337 | | #ifdef COPY |
338 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
339 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
340 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
341 | | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
342 | | dst += 64; |
343 | | #else |
344 | 522k | XOR_INITIAL128(xmm_t0); |
345 | | #endif |
346 | | |
347 | 561k | xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); |
348 | 561k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); |
349 | 561k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); |
350 | 561k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); |
351 | 561k | } |
352 | | |
353 | | /* |
354 | | * len = num bytes left - 64 |
355 | | */ |
356 | 1.38M | if (len >= 48) { |
357 | 178k | len -= 48; |
358 | | |
359 | 178k | xmm_t0 = _mm_load_si128((__m128i *)src); |
360 | 178k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
361 | 178k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
362 | 178k | src += 48; |
363 | | #ifdef COPY |
364 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
365 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
366 | | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
367 | | dst += 48; |
368 | | #else |
369 | 176k | XOR_INITIAL128(xmm_t0); |
370 | | #endif |
371 | 178k | fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
372 | | |
373 | 178k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); |
374 | 178k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); |
375 | 178k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); |
376 | 1.20M | } else if (len >= 32) { |
377 | 195k | len -= 32; |
378 | | |
379 | 195k | xmm_t0 = _mm_load_si128((__m128i *)src); |
380 | 195k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
381 | 195k | src += 32; |
382 | | #ifdef COPY |
383 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
384 | | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
385 | | dst += 32; |
386 | | #else |
387 | 194k | XOR_INITIAL128(xmm_t0); |
388 | | #endif |
389 | 195k | fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
390 | | |
391 | 195k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); |
392 | 195k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); |
393 | 1.01M | } else if (len >= 16) { |
394 | 896k | len -= 16; |
395 | 896k | xmm_t0 = _mm_load_si128((__m128i *)src); |
396 | 896k | src += 16; |
397 | | #ifdef COPY |
398 | | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
399 | | dst += 16; |
400 | | #else |
401 | 894k | XOR_INITIAL128(xmm_t0); |
402 | | #endif |
403 | 896k | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
404 | | |
405 | 896k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); |
406 | 896k | } |
407 | | |
408 | 1.45M | partial: |
409 | 1.45M | if (len) { |
410 | 1.18M | memcpy(&xmm_crc_part, src, len); |
411 | | #ifdef COPY |
412 | | _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part); |
413 | | memcpy(dst, partial_buf, len); |
414 | | #endif |
415 | 1.18M | partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); |
416 | 1.18M | } |
417 | | |
418 | 1.45M | crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
419 | 1.45M | } |

Instantiations:
  crc32_fold_pclmulqdq_copy: executed (16.8k entries at line 21; per-line counts follow the combined listing above)
  Unexecuted instantiation: crc32_fold_vpclmulqdq_copy
  crc32_fold_pclmulqdq: executed (1.43M entries at line 23; per-line counts follow the combined listing above)
  Unexecuted instantiation: crc32_fold_vpclmulqdq
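The instantiation list above reflects that this template header is compiled more than once: the function names are supplied by the including source through the CRC32_FOLD/CRC32_FOLD_COPY macros, with and without COPY defined, and again for the VPCLMULQDQ build. A hypothetical sketch of that pattern follows; the macro values are inferred from the instantiation names in this report, and the real wrapper lives in the including sources, not in this header.

    /* Hypothetical instantiation sketch -- illustrative only. */
    #define COPY
    #define CRC32_FOLD_COPY crc32_fold_pclmulqdq_copy
    #include "crc32_fold_pclmulqdq_tpl.h"   /* emits the copying variant */
    #undef CRC32_FOLD_COPY
    #undef COPY

    #define CRC32_FOLD crc32_fold_pclmulqdq
    #include "crc32_fold_pclmulqdq_tpl.h"   /* emits the CRC-only variant */
    #undef CRC32_FOLD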