Coverage Report

Created: 2025-10-10 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/chorba_sse2.c
Line
Count
Source
1
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2)
2
3
#include "zbuild.h"
4
#include "crc32_braid_p.h"
5
#include "crc32_braid_tbl.h"
6
#include "crc32.h"
7
#include <emmintrin.h>
8
#include "arch/x86/x86_intrins.h"
9
#include "arch/generic/generic_functions.h"
10
#include <assert.h>
11
12
uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len);
13
14
0
/*
 * Load the 32 bytes that start `off` bytes into the 64-bit word array `in`
 * as two 128-bit vectors: `a` receives words 0-1, `b` receives words 2-3.
 *
 * Aligned loads are used, so `in` plus `off` bytes must be 16-byte aligned.
 * `in` and `off` are each expanded twice; pass side-effect-free expressions.
 * The expansion is a single statement with no trailing semicolon, so the
 * caller supplies the `;` and the macro is safe inside if/else.
 */
#define READ_NEXT(in, off, a, b) do { \
        (a) = _mm_load_si128((const __m128i *)((in) + (off) / sizeof(uint64_t))); \
        (b) = _mm_load_si128((const __m128i *)((in) + (off) / sizeof(uint64_t) + 2)); \
    } while (0)
18
19
0
/*
 * Apply the per-word shift/XOR step to both 64-bit lanes of `invec` at once.
 * For each lane x the four outputs are:
 *   a = (x << 17) ^ (x << 55)
 *   b = (x >> 47) ^ (x >> 9) ^ (x << 19)
 *   c = (x >> 45) ^ (x << 44)
 *   d = (x >> 20)
 *
 * `invec` is captured once, so it may alias any of the outputs. The expansion
 * is a single statement with no trailing semicolon; the caller supplies it.
 */
#define NEXT_ROUND(invec, a, b, c, d) do { \
        const __m128i next_round_in_ = (invec); \
        (a) = _mm_xor_si128(_mm_slli_epi64(next_round_in_, 17), _mm_slli_epi64(next_round_in_, 55)); \
        (b) = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(next_round_in_, 47), _mm_srli_epi64(next_round_in_, 9)), \
                            _mm_slli_epi64(next_round_in_, 19)); \
        (c) = _mm_xor_si128(_mm_srli_epi64(next_round_in_, 45), _mm_slli_epi64(next_round_in_, 44)); \
        (d) = _mm_srli_epi64(next_round_in_, 20); \
    } while (0)
25
26
0
/*
 * Fold the NEXT_ROUND products of one 32-byte step into the carried state.
 *
 * Operates on the locals of chorba_small_nondestructive_sse2():
 *   ab1..ab4 hold {a, b} (products of words 1 and 2 of the step),
 *   cd1..cd4 hold {c, d} (products of words 3 and 4 of the step).
 * In scalar terms:
 *   next1 = carry ^ a3 ^ b2 ^ c1
 *   next2 =         a4 ^ b3 ^ c2 ^ d1
 *   next3 =         b4 ^ c3 ^ d2
 *   next4 =         c4 ^ d3
 *   next5 =         d4
 * `carry` is a vector whose low lane is the previous next5 (pass next56), or
 * zero when next5 has already been consumed. It is read before next56 is
 * overwritten, so passing next56 itself is fine.
 */
#define CHORBA_FOLD_STATE(carry) do { \
        const __m128i fold_b2c2_ = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1)); \
        const __m128i fold_b4c4_ = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1)); \
        const __m128i fold_c5a4_ = _mm_unpacklo_epi64((carry), ab4); \
        const __m128i fold_d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128()); \
        next12 = _mm_xor_si128(_mm_xor_si128(ab3, cd1), _mm_xor_si128(fold_b2c2_, fold_c5a4_)); \
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(fold_b4c4_, fold_d2_)); \
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128()); \
    } while (0)

/*
 * CRC-32 over `len` bytes at `buf` using SSE2, without writing to `buf`.
 *
 * crc  Running CRC state. It is XORed into the first 64-bit word of the data;
 *      no pre- or post-inversion is applied here.
 * buf  Input data. Must be 16-byte aligned: every vector access is an aligned
 *      load at a multiple of 16 bytes from `buf`.
 * len  Number of bytes to process.
 *
 * Returns the updated CRC state.
 *
 * Structure:
 *   1. Main loop, 320 bytes per iteration. The first 64 bytes (chorba1..8,
 *      after absorbing the carried state) are not run through NEXT_ROUND
 *      themselves; they are XORed into later words. The following 256 bytes
 *      are handled as eight 32-byte steps ("0-3" .. "28-31"), each mixing in
 *      a fixed set of chorba words, the carried state, and then NEXT_ROUND.
 *   2. Short loop, 32 bytes per iteration, carried state and NEXT_ROUND only.
 *   3. Tail: the remaining bytes (at most 72) are copied to a scratch buffer,
 *      the five carried words are XORed onto its front, and the result is
 *      reduced byte by byte with crc_table.
 *
 * The scalar comments on each step (in1..in4, a1/a2/b1, chorbaN, nextN) list
 * the terms XORed into the four words of that step; vector names such as
 * chorba23 hold the two named words in the {low, high} lanes.
 */
Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint64_t* buf, size_t len) {
    const uint64_t* input = buf;
    ALIGNED_(16) uint64_t final[9] = {0};
    uint64_t next1 = crc;
    crc = 0;

    /* Carried state: {next1, next2}, {next3, next4}, {next5, 0}. */
    __m128i next12 = _mm_cvtsi64_si128(next1);
    __m128i next34 = _mm_setzero_si128();
    __m128i next56 = _mm_setzero_si128();
    __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;

    size_t i = 0;

    /* This is weird, doing for vs while drops 10% off the exec time */
    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;
        __m128i a2_; /* {0, a2} for the current step */

        const __m128i *inputPtr128 = (const __m128i *)(input + (i / sizeof(uint64_t)));
        __m128i chorba12 = _mm_load_si128(inputPtr128++);
        __m128i chorba34 = _mm_load_si128(inputPtr128++);
        __m128i chorba56 = _mm_load_si128(inputPtr128++);
        __m128i chorba78 = _mm_load_si128(inputPtr128++);

        /*
         * chorba1 ^= next1; chorba2 ^= next2; chorba3 ^= next3;
         * chorba4 ^= next4; chorba5 ^= next5;
         * chorba7 ^= chorba1; chorba8 ^= chorba2;
         */
        chorba12 = _mm_xor_si128(chorba12, next12);
        chorba34 = _mm_xor_si128(chorba34, next34);
        chorba56 = _mm_xor_si128(chorba56, next56);
        chorba78 = _mm_xor_si128(chorba78, chorba12);

        /* Lane-shifted views of the chorba words; none are modified below. */
        const __m128i chorba01 = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
        const __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
                                                                 _mm_castsi128_pd(chorba34), 1));
        const __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34),
                                                                 _mm_castsi128_pd(chorba56), 1));
        const __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56),
                                                                 _mm_castsi128_pd(chorba78), 1));
        const __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());

        i += 8 * 8;

        /* 0-3
         * in1 ^= chorba3;
         * in2 ^= chorba4 ^ chorba1;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba34, chorba01));
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
         * in4 ^= b1 ^ a2 ^ chorba6 ^ chorba3 ^ chorba2;
         */
        /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(_mm_xor_si128(chorba56, a2_), chorba23));
        in3in4 = _mm_xor_si128(in3in4, chorba12);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /* next5 was already absorbed into chorba5, so nothing is carried in. */
        CHORBA_FOLD_STATE(_mm_setzero_si128());

        i += 32;

        /* 4-7
         * in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
         * in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, chorba45);
        in1in2 = _mm_xor_si128(in1in2, chorba34);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
         * in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, next34);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        in3in4 = _mm_xor_si128(in3in4, chorba56);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, a2_));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 8-11
         * in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
         * in2 ^= next2 ^ chorba8 ^ chorba2;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba12));
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba3;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba4;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        in3in4 = _mm_xor_si128(in3in4, chorba34);
        in3in4 = _mm_xor_si128(in3in4, a2_);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 12-15
         * in1 ^= next1 ^ chorba5 ^ chorba1;
         * in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba01);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, a2_));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 16-19
         * in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
         * in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
        in1in2 = _mm_xor_si128(chorba01, in1in2);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba01, a2_));
        in3in4 = _mm_xor_si128(in3in4, next34);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 20-23
         * in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
         * in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba01, a2_));
        in3in4 = _mm_xor_si128(in3in4, chorba12);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 24-27
         * in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
         * in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);

        i += 32;

        /* 28-31
         * in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
         * in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
         * in4 ^= next4 ^ a2 ^ b1 ^ chorba8;
         */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);
        /* The loop increment accounts for the 32 bytes of step 28-31. */
    }

    /* Short loop: 32 bytes per iteration, carried state only. */
    for (; (i + 40 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;

        /* in1 ^= next1;
         * in2 ^= next2;
         */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /* in3 ^= next3 ^ a1;
         * in4 ^= next4 ^ a2 ^ b1;
         */
        const __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        CHORBA_FOLD_STATE(next56);
    }

    /* Spill the carried state back to scalars for the tail. */
    next1 = _mm_cvtsi128_si64(next12);
    const uint64_t next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12));
    const uint64_t next3 = _mm_cvtsi128_si64(next34);
    const uint64_t next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34));
    const uint64_t next5 = _mm_cvtsi128_si64(next56);

    /* Skip the call to memcpy.
     * Both loops have exited, so i + 72 >= len and the remaining bytes fit
     * in final[] (9 * 8 = 72 bytes). */
    size_t copy_len = len - i;
    __m128i *final128 = (__m128i *)final;
    const __m128i *input128 = (const __m128i *)(input + i / sizeof(uint64_t));
    while (copy_len >= 64) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 64;
    }

    while (copy_len >= 16) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 16;
    }

    const uint8_t *src_bytes = (const uint8_t *)input128;
    uint8_t *dst_bytes = (uint8_t *)final128;
    while (copy_len--) {
        *dst_bytes++ = *src_bytes++;
    }

    final[0] ^= next1;
    final[1] ^= next2;
    final[2] ^= next3;
    final[3] ^= next4;
    final[4] ^= next5;

    /* We perform the same loop that braid_internal is doing but we'll skip
     * the function call for this tiny tail */
    const uint8_t *final_bytes = (const uint8_t *)final;
    size_t rem = len - i;

    while (rem--) {
        crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
    }

    return crc;
}

#undef CHORBA_FOLD_STATE
851
852
0
/*
 * CRC-32 entry point for the SSE2 Chorba path.
 *
 * crc  CRC of the data seen so far; it is inverted on entry and the result is
 *      inverted again before returning.
 * buf  Input bytes, any alignment.
 * len  Number of bytes in buf.
 *
 * The bytes before the first 16-byte boundary go through
 * crc32_braid_internal(). The aligned remainder is dispatched by size:
 *   > CHORBA_LARGE_THRESHOLD : crc32_chorba_118960_nondestructive()
 *   > 72 bytes               : chorba_small_nondestructive_sse2()
 *   otherwise                : crc32_braid_internal()
 * If the buffer ends at or before that boundary, all of it goes through
 * crc32_braid_internal().
 */
Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
    /* Pre-condition the CRC. */
    uint32_t state = (~crc) & 0xffffffff;

    /* Distance from buf to the next 16-byte boundary, 0..15. */
    unsigned long lead = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;

    if (lead >= len) {
        /* Nothing would be left once the head is aligned. */
        state = crc32_braid_internal(state, buf, len);
        return state ^ 0xffffffff;
    }

    if (lead != 0) {
        state = crc32_braid_internal(state, buf, lead);
    }

    uint64_t *body = (uint64_t*) (buf + lead);
    size_t body_len = len - lead;

    if (body_len > CHORBA_LARGE_THRESHOLD) {
        state = crc32_chorba_118960_nondestructive(state, (z_word_t*) body, body_len);
    } else if (body_len > 72) {
        state = chorba_small_nondestructive_sse2(state, body, body_len);
    } else {
        state = crc32_braid_internal(state, (uint8_t*) body, body_len);
    }

    /* Return the CRC, post-conditioned. */
    return state ^ 0xffffffff;
}
880
#endif