/* /src/zlib-ng/arch/x86/chorba_sse41.c */
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41)

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch/generic/generic_functions.h"
#include <assert.h>

uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
uint32_t chorba_small_nondestructive_sse2(uint32_t c, const uint64_t *aligned_buf, size_t aligned_len);

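/* READ_NEXT loads two consecutive, aligned 128-bit vectors from a uint64_t
 * buffer at byte offset off. */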
#define READ_NEXT(in, off, a, b) do { \
    a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
    b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
} while (0)

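/* NEXT_ROUND expands one 128-bit block into four partial terms, each built by
 * XORing fixed left/right shifts of the two 64-bit lanes (the shift counts are
 * specific to the CRC-32 polynomial used by the Chorba reduction). The callers
 * below fold these outputs into the following input blocks. */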
#define NEXT_ROUND(invec, a, b, c, d) do { \
    a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
    b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
    c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
    d = _mm_srli_epi64(invec, 20); \
} while (0)

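/* REALIGN_CHORBA moves a 64-byte block (four vectors) forward by `shift` bytes
 * (always 8 here), spreading it across five output vectors: out0 holds the
 * zero-padded leading bytes and out4 the bytes that spill past the original
 * block, so the block is effectively re-split at a one-word offset. */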
#define REALIGN_CHORBA(in0, in1, in2, in3, out0, out1, out2, out3, out4, shift) do { \
    out0 = _mm_slli_si128(in0, shift); \
    out1 = _mm_alignr_epi8(in1, in0, shift); \
    out2 = _mm_alignr_epi8(in2, in1, shift); \
    out3 = _mm_alignr_epi8(in3, in2, shift); \
    out4 = _mm_srli_si128(in3, shift); \
} while (0)

#define STORE4(out0, out1, out2, out3, out) do { \
    _mm_store_si128(out++, out0); \
    _mm_store_si128(out++, out1); \
    _mm_store_si128(out++, out2); \
    _mm_store_si128(out++, out3); \
} while (0)

#define READ4(out0, out1, out2, out3, in) do { \
    out0 = _mm_load_si128(in++); \
    out1 = _mm_load_si128(in++); \
    out2 = _mm_load_si128(in++); \
    out3 = _mm_load_si128(in++); \
} while (0)

/* The reads here are intentionally offset by one vector to compensate for the
 * deferred store from the last iteration */
#define READ4_WITHXOR(out0, out1, out2, out3, xor0, xor1, xor2, xor3, in) do { \
    out0 = _mm_xor_si128(in[1], xor0); \
    out1 = _mm_xor_si128(in[2], xor1); \
    out2 = _mm_xor_si128(in[3], xor2); \
    out3 = _mm_xor_si128(in[4], xor3); \
} while (0)

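/* Chorba CRC-32 for mid-sized buffers using a 32 KiB scratch "bitbuffer":
 * each 64-byte input block is folded into the scratch buffer at several fixed
 * word offsets ahead of the current position (see bitbuf144/182/210/300
 * below), then a final pass folds the remaining input together with the
 * accumulated scratch data down to a short tail that is finished with the
 * byte-wise table. The input buffer itself is never written, hence
 * "nondestructive". */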
static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t crc, const uint64_t* buf, size_t len) {
    const uint64_t* input = buf;
    ALIGNED_(16) uint64_t bitbuffer[32768 / sizeof(uint64_t)];
    __m128i *bitbuffer_v = (__m128i*)bitbuffer;
    const uint8_t* bitbufferbytes = (const uint8_t*) bitbuffer;
    __m128i z = _mm_setzero_si128();

    __m128i *bitbuf128 = &bitbuffer_v[64];
    __m128i *bitbuf144 = &bitbuffer_v[72];
    __m128i *bitbuf182 = &bitbuffer_v[91];
    __m128i *bitbuf210 = &bitbuffer_v[105];
    __m128i *bitbuf300 = &bitbuffer_v[150];
    __m128i *bitbuf0 = bitbuf128;
    __m128i *inptr = (__m128i*)input;

    /* We only need to zero out the bytes between the 128th value and the 144th
     * that are actually read */
    __m128i *z_cursor = bitbuf128;
    for (size_t i = 0; i < 2; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* We only need to zero out the bytes between the 144th value and the 182nd
     * that are actually read */
    z_cursor = bitbuf144 + 8;
    for (size_t i = 0; i < 11; ++i) {
        _mm_store_si128(z_cursor++, z);
    }

    /* We only need to zero out the bytes between the 182nd value and the 210th
     * that are actually read. */
    z_cursor = bitbuf182;
    for (size_t i = 0; i < 4; ++i) {
        STORE4(z, z, z, z, z_cursor);
    }

    /* The initial CRC value needs to be mixed into the first input block */
    __m128i init_crc = _mm_cvtsi64_si128(crc);
    crc = 0;

    size_t i = 0;

    /* Carry-over values from the previous iteration of each loop */
    __m128i buf144 = z;
    __m128i buf182 = z;
    __m128i buf210 = z;

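    /* The folding loops below are split into phases by `i`, apparently so that
     * each phase only reads back (READ4_WITHXOR) regions of the bitbuffer that
     * have already been written or zeroed by earlier iterations; regions that
     * cannot hold live data yet are simply overwritten. */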
    for(; i + 300*8+64 < len && i < 22 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        READ4(in12, in34, in56, in78, inptr);

        if (i == 0) {
            in12 = _mm_xor_si128(in12, init_crc);
        }

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e = _mm_xor_si128(buf182, in_1);
        STORE4(e, in23, in45, in67, bitbuf182);
        buf182 = in8_;

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

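    /* From here on, the reads through bitbuf182 overlap data stored by earlier
     * iterations, so that stream is read back and XORed instead of stored over. */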
    for(; i + 300*8+64 < len && i < 32 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a = _mm_xor_si128(buf144, in_1);

        STORE4(a, in23, in45, in67, bitbuf144);
        buf144 = in8_;

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

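    /* From here on, the reads through bitbuf144 also overlap previously stored
     * data, so that stream is read back and XORed as well. */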
    for(; i + 300*8+64 < len && i < 84 * 8; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;
        READ4(in12, in34, in56, in78, inptr);

        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i m = _mm_xor_si128(buf210, in_1);
        STORE4(m, in23, in45, in67, bitbuf210);
        buf210 = in8_;

        STORE4(in12, in34, in56, in78, bitbuf300);
    }

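    /* Steady-state loop: all three intermediate streams are read back and
     * XORed. Once i reaches 128 * 8, the loads also start XORing in the
     * bitbuffer from word 128 onward (bitbuf0), merging previously folded
     * contributions back into the input stream. */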
    for(; i + 300*8+64 < len; i += 64) {
        __m128i in12, in34, in56, in78,
                in_1, in23, in45, in67, in8_;

        if (i < 128 * 8) {
            READ4(in12, in34, in56, in78, inptr);
        } else {
            in12 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in34 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in56 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
            in78 = _mm_xor_si128(_mm_load_si128(inptr++), _mm_load_si128(bitbuf0++));
        }

        // [0, 145, 183, 211]

        /* On pre-Penryn CPUs the unpack should be faster */
        REALIGN_CHORBA(in12, in34, in56, in78,
                       in_1, in23, in45, in67, in8_, 8);

        __m128i a, b, c, d;
        a = _mm_xor_si128(buf144, in_1);
        READ4_WITHXOR(b, c, d, buf144, in23, in45, in67, in8_, bitbuf144);
        STORE4(a, b, c, d, bitbuf144);

        __m128i e, f, g, h;
        e = _mm_xor_si128(buf182, in_1);
        READ4_WITHXOR(f, g, h, buf182, in23, in45, in67, in8_, bitbuf182);
        STORE4(e, f, g, h, bitbuf182);

        __m128i n, o, p;
        __m128i m = _mm_xor_si128(buf210, in_1);

        /* Couldn't tell you why, but even though this condition is known to
         * always be false here, removing the branch makes GCC generate
         * significantly slower code. Some loop bodies must be getting merged
         * or something */
        if (i < 84 * 8) {
            n = in23;
            o = in45;
            p = in67;
            buf210 = in8_;
        } else {
            READ4_WITHXOR(n, o, p, buf210, in23, in45, in67, in8_, bitbuf210);
        }

        STORE4(m, n, o, p, bitbuf210);
        STORE4(in12, in34, in56, in78, bitbuf300);
    }

    /* The stores deferred from the final loop iteration, bubbled out here */
    _mm_store_si128(bitbuf144, buf144);
    _mm_store_si128(bitbuf182, buf182);
    _mm_store_si128(bitbuf210, buf210);

    /* We also have to zero out the tail */
    size_t left_to_z = len - (300*8 + i);
    __m128i *bitbuf_tail = (__m128i*)(bitbuffer + 300 + i/8);
    while (left_to_z >= 64) {
        STORE4(z, z, z, z, bitbuf_tail);
        left_to_z -= 64;
    }

    while (left_to_z >= 16) {
        _mm_store_si128(bitbuf_tail++, z);
        left_to_z -= 16;
    }

    uint8_t *tail_bytes = (uint8_t*)bitbuf_tail;
    while (left_to_z--) {
        *tail_bytes++ = 0;
    }

    ALIGNED_(16) uint64_t final[9] = {0};
    __m128i next12, next34, next56;
    next12 = z;
    next34 = z;
    next56 = z;

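    /* Final reduction: combine the remaining input with the accumulated
     * bitbuffer 32 bytes at a time and fold each pair of vectors forward with
     * NEXT_ROUND, carrying the overflow in next12/next34/next56. */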
    for(; (i + 72 < len); i += 32) {
        __m128i in1in2, in3in4;
        __m128i in1in2_, in3in4_;
        __m128i ab1, ab2, ab3, ab4;
        __m128i cd1, cd2, cd3, cd4;

        READ_NEXT(input, i, in1in2, in3in4);
        READ_NEXT(bitbuffer, i, in1in2_, in3in4_);

        in1in2 = _mm_xor_si128(_mm_xor_si128(in1in2, in1in2_), next12);
        in3in4 = _mm_xor_si128(in3in4, in3in4_);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        __m128i a2_ = _mm_slli_si128(ab2, 8);
        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        __m128i b2c2 = _mm_alignr_epi8(cd2, ab2, 8);
        __m128i a4_ = _mm_slli_si128(ab4, 8);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_srli_si128(cd2, 8);
        __m128i b4c4 = _mm_alignr_epi8(cd4, ab4, 8);
        next12 = _mm_xor_si128(next12, next56);
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_srli_si128(cd4, 8);
    }

    memcpy(final, input+(i / sizeof(uint64_t)), len-i);
    __m128i *final128 = (__m128i*)final;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next12));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next34));
    ++final128;
    _mm_store_si128(final128, _mm_xor_si128(_mm_load_si128(final128), next56));

    uint8_t* final_bytes = (uint8_t*) final;

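    /* Finish the remaining tail byte-wise with the CRC table, folding in both
     * the copied (and already-adjusted) `final` bytes and the corresponding
     * bitbuffer bytes. */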
    for(size_t j = 0; j < (len-i); j++) {
        crc = crc_table[(crc ^ final_bytes[j] ^ bitbufferbytes[(j+i)]) & 0xff] ^ (crc >> 8);
    }
    return crc;
}

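/* Dispatcher: pre-condition the CRC, bring the buffer up to 16-byte alignment
 * with the braid routine, then pick an implementation by length: the large
 * generic Chorba, this 32 KiB SSE4.1 variant, the small SSE2 variant, or the
 * braid fallback. */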
Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c;
    uint64_t* aligned_buf;
    size_t aligned_len;

    c = (~crc) & 0xffffffff;
    uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
    if (algn_diff < len) {
        if (algn_diff) {
            c = crc32_braid_internal(c, buf, algn_diff);
        }
        aligned_buf = (uint64_t*) (buf + algn_diff);
        aligned_len = len - algn_diff;
        if(aligned_len > CHORBA_LARGE_THRESHOLD) {
            c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
        } else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD &&
                   aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) {
            c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len);
        } else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) {
            c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
        } else {
            c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len);
        }
    }
    else {
        c = crc32_braid_internal(c, buf, len);
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
#endif