/src/zlib-ng/arch/x86/crc32_pclmulqdq_tpl.h
Line | Count | Source |
1 | | /* crc32_pclmulqdq_tpl.h -- Compute the CRC32 using a parallelized folding |
2 | | * approach with the PCLMULQDQ and VPCLMULQDQ instructions. |
3 | | * |
4 | | * A white paper describing this algorithm can be found at: |
5 | | * doc/crc-pclmulqdq.pdf |
6 | | * |
7 | | * Copyright (C) 2020 Wangyang Guo (wangyang.guo@intel.com) (VPCLMULQDQ support) |
8 | | * Copyright (C) 2013 Intel Corporation. All rights reserved. |
9 | | * Copyright (C) 2016 Marian Beermann (support for initial value) |
10 | | * Authors: |
11 | | * Wajdi Feghali <wajdi.k.feghali@intel.com> |
12 | | * Jim Guilford <james.guilford@intel.com> |
13 | | * Vinodh Gopal <vinodh.gopal@intel.com> |
14 | | * Erdinc Ozturk <erdinc.ozturk@intel.com> |
15 | | * Jim Kukunas <james.t.kukunas@linux.intel.com> |
16 | | * |
17 | | * For conditions of distribution and use, see copyright notice in zlib.h |
18 | | */ |
19 | | |
20 | | #include "zbuild.h" |
21 | | |
22 | | #include <immintrin.h> |
23 | | #include <wmmintrin.h> |
24 | | #include <smmintrin.h> // _mm_extract_epi32 |
25 | | |
26 | | #include "crc32_braid_p.h" |
27 | | #include "crc32_braid_tbl.h" |
28 | | #include "crc32_p.h" |
29 | | #include "x86_intrins.h" |
30 | | |
31 | | #ifdef X86_VPCLMULQDQ |
32 | | # if defined(_MSC_VER) && _MSC_VER < 1920 |
33 | | /* Use epi32 variants for older MSVC toolchains (v141/v140) to avoid cast warnings */ |
34 | | # define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi32(a, b, c, 0x96) |
35 | | # define z512_inserti64x2(a, b, imm) _mm512_inserti32x4(a, b, imm) |
36 | | # define z512_extracti64x2(a, imm) _mm512_extracti32x4_epi32(a, imm) |
37 | | # else |
38 | 0 | # define z512_xor3_epi64(a, b, c) _mm512_ternarylogic_epi64(a, b, c, 0x96) |
39 | 0 | # define z512_inserti64x2(a, b, imm) _mm512_inserti64x2(a, b, imm) |
40 | 0 | # define z512_extracti64x2(a, imm) _mm512_extracti64x2_epi64(a, imm) |
41 | | # endif |
42 | | # ifdef __AVX512VL__ |
43 | 0 | # define z128_xor3_epi64(a, b, c) _mm_ternarylogic_epi64(a, b, c, 0x96) |
44 | | # endif |
45 | | #endif |
46 | | |
47 | | #ifndef z128_xor3_epi64 |
48 | 28.9M | # define z128_xor3_epi64(a, b, c) _mm_xor_si128(_mm_xor_si128(a, b), c) |
49 | | #endif |
50 | | |
/* Fold the CRC state forward by one 16-byte lane.
 * The oldest lane (crc0) is carry-less multiplied by the 4x128-bit fold
 * constant and XOR-combined into a new lane; the remaining lanes shift up
 * one position in the pipeline. */
static inline void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
    /* Multiply low qword of crc0 by high qword of the constant, and vice versa. */
    __m128i folded_lo = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    __m128i folded_hi = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);

    /* Rotate the pipeline: crc1/crc2/crc3 move up, folded result becomes crc3. */
    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = *xmm_crc3;
    *xmm_crc3 = _mm_xor_si128(folded_lo, folded_hi);
}
60 | | |
/* Fold the CRC state forward by two 16-byte lanes.
 * The two oldest lanes (crc0, crc1) are each folded with the 4x128-bit
 * constant; the two newest lanes shift up two positions. */
static inline void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
    __m128i folded_lo_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    __m128i folded_hi_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
    __m128i folded_lo_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    __m128i folded_hi_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);

    /* Shift the two surviving lanes up and append the two folded results. */
    *xmm_crc0 = *xmm_crc2;
    *xmm_crc1 = *xmm_crc3;
    *xmm_crc2 = _mm_xor_si128(folded_lo_a, folded_hi_a);
    *xmm_crc3 = _mm_xor_si128(folded_lo_b, folded_hi_b);
}
72 | | |
/* Fold the CRC state forward by three 16-byte lanes.
 * Lanes crc0..crc2 are folded with the 4x128-bit constant; the newest
 * lane (crc3) shifts to the front of the pipeline. */
static inline void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
    __m128i folded_lo_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    __m128i folded_hi_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
    __m128i folded_lo_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    __m128i folded_hi_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    __m128i folded_lo_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    __m128i folded_hi_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);

    *xmm_crc0 = *xmm_crc3;
    *xmm_crc1 = _mm_xor_si128(folded_lo_a, folded_hi_a);
    *xmm_crc2 = _mm_xor_si128(folded_lo_b, folded_hi_b);
    *xmm_crc3 = _mm_xor_si128(folded_lo_c, folded_hi_c);
}
86 | | |
/* Fold all four 16-byte lanes of the CRC state forward by 64 bytes.
 * Each lane is independently carry-less multiplied by the 4x128-bit fold
 * constant and replaced by the XOR of its two partial products. */
static inline void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const __m128i xmm_fold4) {
    __m128i folded_lo_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    __m128i folded_hi_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x10);
    __m128i folded_lo_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    __m128i folded_hi_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    __m128i folded_lo_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    __m128i folded_hi_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    __m128i folded_lo_d = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    __m128i folded_hi_d = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);

    *xmm_crc0 = _mm_xor_si128(folded_lo_a, folded_hi_a);
    *xmm_crc1 = _mm_xor_si128(folded_lo_b, folded_hi_b);
    *xmm_crc2 = _mm_xor_si128(folded_lo_c, folded_hi_c);
    *xmm_crc3 = _mm_xor_si128(folded_lo_d, folded_hi_d);
}
102 | | |
/* Fold all four 16-byte lanes of the CRC state forward by 12x128 bits
 * (192 bytes), using a dedicated fold constant for that distance. */
static inline void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
    /* Fold constant for a 12x128-bit shift distance. */
    const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85);

    __m128i folded_lo_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x01);
    __m128i folded_hi_a = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold12, 0x10);
    __m128i folded_lo_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x01);
    __m128i folded_hi_b = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold12, 0x10);
    __m128i folded_lo_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x01);
    __m128i folded_hi_c = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold12, 0x10);
    __m128i folded_lo_d = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x01);
    __m128i folded_hi_d = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold12, 0x10);

    *xmm_crc0 = _mm_xor_si128(folded_lo_a, folded_hi_a);
    *xmm_crc1 = _mm_xor_si128(folded_lo_b, folded_hi_b);
    *xmm_crc2 = _mm_xor_si128(folded_lo_c, folded_hi_c);
    *xmm_crc3 = _mm_xor_si128(folded_lo_d, folded_hi_d);
}
119 | | |
120 | | #ifdef X86_VPCLMULQDQ |
121 | | static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_crc2, __m512i *zmm_crc3, |
122 | 0 | const __m512i zmm_t0, const __m512i zmm_t1, const __m512i zmm_t2, const __m512i zmm_t3, const __m512i zmm_fold16) { |
123 | 0 | __m512i z_low0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x01); |
124 | 0 | __m512i z_high0 = _mm512_clmulepi64_epi128(*zmm_crc0, zmm_fold16, 0x10); |
125 | 0 | __m512i z_low1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x01); |
126 | 0 | __m512i z_high1 = _mm512_clmulepi64_epi128(*zmm_crc1, zmm_fold16, 0x10); |
127 | 0 | __m512i z_low2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x01); |
128 | 0 | __m512i z_high2 = _mm512_clmulepi64_epi128(*zmm_crc2, zmm_fold16, 0x10); |
129 | 0 | __m512i z_low3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x01); |
130 | 0 | __m512i z_high3 = _mm512_clmulepi64_epi128(*zmm_crc3, zmm_fold16, 0x10); |
131 | |
|
132 | 0 | *zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_t0); |
133 | 0 | *zmm_crc1 = z512_xor3_epi64(z_low1, z_high1, zmm_t1); |
134 | 0 | *zmm_crc2 = z512_xor3_epi64(z_low2, z_high2, zmm_t2); |
135 | 0 | *zmm_crc3 = z512_xor3_epi64(z_low3, z_high3, zmm_t3); |
136 | 0 | } |
137 | | #endif |
138 | | |
139 | 4.42M | Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { |
140 | 4.42M | size_t copy_len = len; |
141 | 4.42M | if (len >= 16) { |
142 | | /* Calculate 16-byte alignment offset */ |
143 | 1.19M | uintptr_t align_diff = ALIGN_DIFF(src, 16); |
144 | | |
145 | | /* If total length is less than (alignment bytes + 16), use the faster small method. |
146 | | * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ |
147 | 1.19M | copy_len = len < align_diff + 16 ? len : align_diff; |
148 | 1.19M | } |
149 | | |
150 | 4.42M | if (copy_len > 0) { |
151 | 4.08M | crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); |
152 | 4.08M | src += copy_len; |
153 | 4.08M | len -= copy_len; |
154 | 4.08M | if (COPY) { |
155 | 4.27k | dst += copy_len; |
156 | 4.27k | } |
157 | 4.08M | } |
158 | | |
159 | 4.42M | if (len == 0) |
160 | 3.23M | return crc; |
161 | | |
162 | 1.18M | const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
163 | | |
164 | 1.18M | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; |
165 | 1.18M | __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); |
166 | 1.18M | __m128i xmm_crc1 = _mm_setzero_si128(); |
167 | 1.18M | __m128i xmm_crc2 = _mm_setzero_si128(); |
168 | 1.18M | __m128i xmm_crc3 = _mm_setzero_si128(); |
169 | | |
170 | 1.18M | if (crc != 0) { |
171 | | // Process the first 16 bytes and handle initial CRC |
172 | 1.01M | len -= 16; |
173 | 1.01M | xmm_t0 = _mm_load_si128((__m128i *)src); |
174 | 1.01M | src += 16; |
175 | | |
176 | 1.01M | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
177 | 1.01M | if (COPY) { |
178 | 14.8k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
179 | 14.8k | dst += 16; |
180 | 14.8k | } |
181 | 1.01M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); |
182 | 1.01M | } |
183 | | |
184 | | #ifdef X86_VPCLMULQDQ |
185 | 0 | if (len >= 256) { |
186 | 0 | len -= 256; |
187 | | |
188 | | __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; |
189 | | __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; |
190 | | __m512i z_low0, z_high0; |
191 | | const __m512i zmm_fold4 = _mm512_set4_epi32( |
192 | | 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
193 | | const __m512i zmm_fold16 = _mm512_set4_epi32( |
194 | | 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); |
195 | | |
196 | | zmm_crc0 = _mm512_loadu_si512((__m512i *)src); |
197 | | zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); |
198 | | zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); |
199 | | zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); |
200 | | src += 256; |
201 | 0 | if (COPY) { |
202 | 0 | _mm512_storeu_si512((__m512i *)dst, zmm_crc0); |
203 | 0 | _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); |
204 | 0 | _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); |
205 | 0 | _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); |
206 | 0 | dst += 256; |
207 | 0 | } |
208 | | |
209 | | // Fold existing xmm state into first 64 bytes |
210 | | zmm_t0 = _mm512_castsi128_si512(xmm_crc0); |
211 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1); |
212 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2); |
213 | 0 | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3); |
214 | | |
215 | | z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); |
216 | | z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); |
217 | 0 | zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0); |
218 | | |
219 | 0 | while (len >= 256) { |
220 | 0 | len -= 256; |
221 | 0 | zmm_t0 = _mm512_loadu_si512((__m512i *)src); |
222 | 0 | zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); |
223 | 0 | zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); |
224 | 0 | zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); |
225 | 0 | src += 256; |
226 | |
|
227 | 0 | fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16); |
228 | 0 | if (COPY) { |
229 | 0 | _mm512_storeu_si512((__m512i *)dst, zmm_t0); |
230 | 0 | _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); |
231 | 0 | _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); |
232 | 0 | _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); |
233 | 0 | dst += 256; |
234 | 0 | } |
235 | 0 | } |
236 | | |
237 | | // zmm_crc[0,1,2,3] -> zmm_crc0 |
238 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
239 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
240 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1); |
241 | | |
242 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
243 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
244 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2); |
245 | | |
246 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
247 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
248 | 0 | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3); |
249 | | |
250 | | // zmm_crc0 -> xmm_crc[0, 1, 2, 3] |
251 | 0 | xmm_crc0 = z512_extracti64x2(zmm_crc0, 0); |
252 | 0 | xmm_crc1 = z512_extracti64x2(zmm_crc0, 1); |
253 | 0 | xmm_crc2 = z512_extracti64x2(zmm_crc0, 2); |
254 | 0 | xmm_crc3 = z512_extracti64x2(zmm_crc0, 3); |
255 | 0 | } |
256 | | #else |
257 | | /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 |
258 | | * We interleave the PCLMUL-base folds with 8x scaled generator |
259 | | * polynomial copies; we read 8x QWORDS and then XOR them into |
260 | | * the stream at the following offsets: 6, 9, 10, 16, 20, 22, |
261 | | * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper |
262 | | * as "generator_64_bits_unrolled_8" */ |
263 | | #ifndef __AVX512VL__ |
264 | 1.18M | if (!COPY) { |
265 | 1.16M | #endif |
266 | 1.89M | while (len >= 512 + 64 + 16*8) { |
267 | 735k | __m128i chorba8 = _mm_load_si128((__m128i *)src); |
268 | 735k | __m128i chorba7 = _mm_load_si128((__m128i *)src + 1); |
269 | 735k | __m128i chorba6 = _mm_load_si128((__m128i *)src + 2); |
270 | 735k | __m128i chorba5 = _mm_load_si128((__m128i *)src + 3); |
271 | 735k | __m128i chorba4 = _mm_load_si128((__m128i *)src + 4); |
272 | 735k | __m128i chorba3 = _mm_load_si128((__m128i *)src + 5); |
273 | 735k | __m128i chorba2 = _mm_load_si128((__m128i *)src + 6); |
274 | 735k | __m128i chorba1 = _mm_load_si128((__m128i *)src + 7); |
275 | 735k | if (COPY) { |
276 | 0 | _mm_storeu_si128((__m128i *)dst, chorba8); |
277 | 0 | _mm_storeu_si128((__m128i *)dst + 1, chorba7); |
278 | 0 | _mm_storeu_si128((__m128i *)dst + 2, chorba6); |
279 | 0 | _mm_storeu_si128((__m128i *)dst + 3, chorba5); |
280 | 0 | _mm_storeu_si128((__m128i *)dst + 4, chorba4); |
281 | 0 | _mm_storeu_si128((__m128i *)dst + 5, chorba3); |
282 | 0 | _mm_storeu_si128((__m128i *)dst + 6, chorba2); |
283 | 0 | _mm_storeu_si128((__m128i *)dst + 7, chorba1); |
284 | 0 | dst += 16*8; |
285 | 0 | } |
286 | | |
287 | | chorba2 = _mm_xor_si128(chorba2, chorba8); |
288 | | chorba1 = _mm_xor_si128(chorba1, chorba7); |
289 | | src += 16*8; |
290 | | len -= 16*8; |
291 | | |
292 | | xmm_t0 = _mm_load_si128((__m128i *)src); |
293 | | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
294 | | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
295 | | xmm_t3 = _mm_load_si128((__m128i *)src + 3); |
296 | | |
297 | | fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); |
298 | 735k | if (COPY) { |
299 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
300 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
301 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
302 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
303 | 0 | dst += 64; |
304 | 0 | } |
305 | | |
306 | 735k | xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0); |
307 | 735k | xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1); |
308 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, xmm_crc2); |
309 | 735k | xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3); |
310 | | |
311 | | xmm_t0 = _mm_load_si128((__m128i *)src + 4); |
312 | | xmm_t1 = _mm_load_si128((__m128i *)src + 5); |
313 | | xmm_t2 = _mm_load_si128((__m128i *)src + 6); |
314 | | xmm_t3 = _mm_load_si128((__m128i *)src + 7); |
315 | | |
316 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
317 | 735k | if (COPY) { |
318 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
319 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
320 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
321 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
322 | 0 | dst += 64; |
323 | 0 | } |
324 | | |
325 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0); |
326 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1); |
327 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2); |
328 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3); |
329 | | |
330 | | xmm_t0 = _mm_load_si128((__m128i *)src + 8); |
331 | | xmm_t1 = _mm_load_si128((__m128i *)src + 9); |
332 | | xmm_t2 = _mm_load_si128((__m128i *)src + 10); |
333 | | xmm_t3 = _mm_load_si128((__m128i *)src + 11); |
334 | | |
335 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
336 | 735k | if (COPY) { |
337 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
338 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
339 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
340 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
341 | 0 | dst += 64; |
342 | 0 | } |
343 | | |
344 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0); |
345 | 735k | xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1); |
346 | 735k | xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2); |
347 | 735k | xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3); |
348 | | |
349 | | xmm_t0 = _mm_load_si128((__m128i *)src + 12); |
350 | | xmm_t1 = _mm_load_si128((__m128i *)src + 13); |
351 | | xmm_t2 = _mm_load_si128((__m128i *)src + 14); |
352 | | xmm_t3 = _mm_load_si128((__m128i *)src + 15); |
353 | | |
354 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
355 | 735k | if (COPY) { |
356 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
357 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
358 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
359 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
360 | 0 | dst += 64; |
361 | 0 | } |
362 | | |
363 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0); |
364 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1); |
365 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2); |
366 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3); |
367 | | |
368 | | xmm_t0 = _mm_load_si128((__m128i *)src + 16); |
369 | | xmm_t1 = _mm_load_si128((__m128i *)src + 17); |
370 | | xmm_t2 = _mm_load_si128((__m128i *)src + 18); |
371 | | xmm_t3 = _mm_load_si128((__m128i *)src + 19); |
372 | | |
373 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
374 | 735k | if (COPY) { |
375 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
376 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
377 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
378 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
379 | 0 | dst += 64; |
380 | 0 | } |
381 | | |
382 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, chorba5), xmm_crc0); |
383 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1); |
384 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2); |
385 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3); |
386 | | |
387 | | xmm_t0 = _mm_load_si128((__m128i *)src + 20); |
388 | | xmm_t1 = _mm_load_si128((__m128i *)src + 21); |
389 | | xmm_t2 = _mm_load_si128((__m128i *)src + 22); |
390 | | xmm_t3 = _mm_load_si128((__m128i *)src + 23); |
391 | | |
392 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
393 | 735k | if (COPY) { |
394 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
395 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
396 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
397 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
398 | 0 | dst += 64; |
399 | 0 | } |
400 | | |
401 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0); |
402 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1); |
403 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2); |
404 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3); |
405 | | |
406 | | xmm_t0 = _mm_load_si128((__m128i *)src + 24); |
407 | | xmm_t1 = _mm_load_si128((__m128i *)src + 25); |
408 | | xmm_t2 = _mm_load_si128((__m128i *)src + 26); |
409 | | xmm_t3 = _mm_load_si128((__m128i *)src + 27); |
410 | | |
411 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
412 | 735k | if (COPY) { |
413 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
414 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
415 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
416 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
417 | 0 | dst += 64; |
418 | 0 | } |
419 | | |
420 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0); |
421 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1); |
422 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2); |
423 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3); |
424 | | |
425 | | xmm_t0 = _mm_load_si128((__m128i *)src + 28); |
426 | | xmm_t1 = _mm_load_si128((__m128i *)src + 29); |
427 | | xmm_t2 = _mm_load_si128((__m128i *)src + 30); |
428 | | xmm_t3 = _mm_load_si128((__m128i *)src + 31); |
429 | | |
430 | | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
431 | 735k | if (COPY) { |
432 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
433 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
434 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
435 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
436 | 0 | dst += 64; |
437 | 0 | } |
438 | | |
439 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0); |
440 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1); |
441 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2); |
442 | 735k | xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3); |
443 | | |
444 | 735k | len -= 512; |
445 | 735k | src += 512; |
446 | 735k | } |
447 | 1.16M | #ifndef __AVX512VL__ |
448 | 1.16M | } |
449 | | #endif |
450 | | |
451 | | #endif /* X86_VPCLMULQDQ */ |
452 | | |
453 | 11.4M | while (len >= 64) { |
454 | 10.2M | len -= 64; |
455 | 10.2M | xmm_t0 = _mm_load_si128((__m128i *)src); |
456 | 10.2M | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
457 | 10.2M | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
458 | 10.2M | xmm_t3 = _mm_load_si128((__m128i *)src + 3); |
459 | 10.2M | src += 64; |
460 | | |
461 | 10.2M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
462 | 10.2M | if (COPY) { |
463 | 9.51M | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
464 | 9.51M | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
465 | 9.51M | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
466 | 9.51M | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); |
467 | 9.51M | dst += 64; |
468 | 9.51M | } |
469 | | |
470 | 10.2M | xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); |
471 | 10.2M | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); |
472 | 10.2M | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); |
473 | 10.2M | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); |
474 | 10.2M | } |
475 | | |
476 | | /* |
477 | | * len = num bytes left - 64 |
478 | | */ |
479 | 1.18M | if (len >= 48) { |
480 | 71.3k | len -= 48; |
481 | | |
482 | 71.3k | xmm_t0 = _mm_load_si128((__m128i *)src); |
483 | 71.3k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
484 | 71.3k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); |
485 | 71.3k | src += 48; |
486 | | |
487 | 71.3k | fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
488 | 71.3k | if (COPY) { |
489 | 12.9k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
490 | 12.9k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
491 | 12.9k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); |
492 | 12.9k | dst += 48; |
493 | 12.9k | } |
494 | | |
495 | 71.3k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); |
496 | 71.3k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); |
497 | 71.3k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); |
498 | 1.11M | } else if (len >= 32) { |
499 | 282k | len -= 32; |
500 | | |
501 | 282k | xmm_t0 = _mm_load_si128((__m128i *)src); |
502 | 282k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); |
503 | 282k | src += 32; |
504 | | |
505 | 282k | fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
506 | 282k | if (COPY) { |
507 | 4.49k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
508 | 4.49k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); |
509 | 4.49k | dst += 32; |
510 | 4.49k | } |
511 | | |
512 | 282k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); |
513 | 282k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); |
514 | 834k | } else if (len >= 16) { |
515 | 216k | len -= 16; |
516 | 216k | xmm_t0 = _mm_load_si128((__m128i *)src); |
517 | 216k | src += 16; |
518 | | |
519 | 216k | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); |
520 | 216k | if (COPY) { |
521 | 3.29k | _mm_storeu_si128((__m128i *)dst, xmm_t0); |
522 | 3.29k | dst += 16; |
523 | 3.29k | } |
524 | | |
525 | 216k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); |
526 | 216k | } |
527 | | |
528 | 1.18M | const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); |
529 | 1.18M | const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); |
530 | | |
531 | | /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */ |
532 | 1.18M | __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); |
533 | 1.18M | __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); |
534 | 1.18M | xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); |
535 | | |
536 | 1.18M | __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); |
537 | 1.18M | __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); |
538 | 1.18M | xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); |
539 | | |
540 | 1.18M | __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); |
541 | 1.18M | __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); |
542 | 1.18M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); |
543 | | |
544 | | /* Fold remaining bytes into the 128-bit state */ |
545 | 1.18M | if (len) { |
546 | 873k | const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); |
547 | 873k | const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); |
548 | | |
549 | | /* Create masks to shift bytes for partial input */ |
550 | 873k | __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); |
551 | 873k | __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); |
552 | | |
553 | | /* Shift out bytes from crc3 to make space for new data */ |
554 | 873k | __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); |
555 | 873k | xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); |
556 | | |
557 | | /* Insert the partial input into crc3 */ |
558 | | #if defined(__AVX512BW__) && defined(__AVX512VL__) |
559 | | __mmask16 k = (1 << len) - 1; |
560 | | __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); |
561 | 0 | if (COPY) { |
562 | 0 | _mm_mask_storeu_epi8(dst, k, xmm_crc_part); |
563 | 0 | } |
564 | | #else |
565 | | __m128i xmm_crc_part = _mm_setzero_si128(); |
566 | | memcpy(&xmm_crc_part, src, len); |
567 | 873k | if (COPY) { |
568 | 11.8k | memcpy(dst, src, len); |
569 | 11.8k | } |
570 | | #endif |
571 | 873k | __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); |
572 | 873k | xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); |
573 | | |
574 | | /* Fold the bytes that were shifted out back into crc3 */ |
575 | 873k | __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01); |
576 | 873k | __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10); |
577 | 873k | xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high); |
578 | 873k | } |
579 | | |
580 | | /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */ |
581 | 1.18M | __m128i x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00); |
582 | 1.18M | __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10); |
583 | | |
584 | 1.18M | x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf); |
585 | 1.18M | x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3); |
586 | | |
587 | 1.18M | __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01); |
588 | 1.18M | __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10); |
589 | | |
590 | 1.18M | crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2)); |
591 | | |
592 | 1.18M | return ~crc; |
593 | 4.42M | } crc32_pclmulqdq.c:crc32_copy_impl Line | Count | Source | 139 | 4.42M | Z_FORCEINLINE static uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { | 140 | 4.42M | size_t copy_len = len; | 141 | 4.42M | if (len >= 16) { | 142 | | /* Calculate 16-byte alignment offset */ | 143 | 1.19M | uintptr_t align_diff = ALIGN_DIFF(src, 16); | 144 | | | 145 | | /* If total length is less than (alignment bytes + 16), use the faster small method. | 146 | | * Handles both initially small buffers and cases where alignment would leave < 16 bytes */ | 147 | 1.19M | copy_len = len < align_diff + 16 ? len : align_diff; | 148 | 1.19M | } | 149 | | | 150 | 4.42M | if (copy_len > 0) { | 151 | 4.08M | crc = ~crc32_copy_small(~crc, dst, src, copy_len, 31, COPY); | 152 | 4.08M | src += copy_len; | 153 | 4.08M | len -= copy_len; | 154 | 4.08M | if (COPY) { | 155 | 4.27k | dst += copy_len; | 156 | 4.27k | } | 157 | 4.08M | } | 158 | | | 159 | 4.42M | if (len == 0) | 160 | 3.23M | return crc; | 161 | | | 162 | 1.18M | const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); | 163 | | | 164 | 1.18M | __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; | 165 | 1.18M | __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); | 166 | 1.18M | __m128i xmm_crc1 = _mm_setzero_si128(); | 167 | 1.18M | __m128i xmm_crc2 = _mm_setzero_si128(); | 168 | 1.18M | __m128i xmm_crc3 = _mm_setzero_si128(); | 169 | | | 170 | 1.18M | if (crc != 0) { | 171 | | // Process the first 16 bytes and handle initial CRC | 172 | 1.01M | len -= 16; | 173 | 1.01M | xmm_t0 = _mm_load_si128((__m128i *)src); | 174 | 1.01M | src += 16; | 175 | | | 176 | 1.01M | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 177 | 1.01M | if (COPY) { | 178 | 14.8k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 179 | 14.8k | dst += 16; | 180 | 14.8k | } | 181 | 1.01M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, xmm_t0, _mm_cvtsi32_si128(crc)); | 182 | 1.01M | } | 
183 | | | 184 | | #ifdef X86_VPCLMULQDQ | 185 | | if (len >= 256) { | 186 | | len -= 256; | 187 | | | 188 | | __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; | 189 | | __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; | 190 | | __m512i z_low0, z_high0; | 191 | | const __m512i zmm_fold4 = _mm512_set4_epi32( | 192 | | 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); | 193 | | const __m512i zmm_fold16 = _mm512_set4_epi32( | 194 | | 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); | 195 | | | 196 | | zmm_crc0 = _mm512_loadu_si512((__m512i *)src); | 197 | | zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); | 198 | | zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); | 199 | | zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); | 200 | | src += 256; | 201 | | if (COPY) { | 202 | | _mm512_storeu_si512((__m512i *)dst, zmm_crc0); | 203 | | _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); | 204 | | _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); | 205 | | _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); | 206 | | dst += 256; | 207 | | } | 208 | | | 209 | | // Fold existing xmm state into first 64 bytes | 210 | | zmm_t0 = _mm512_castsi128_si512(xmm_crc0); | 211 | | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc1, 1); | 212 | | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc2, 2); | 213 | | zmm_t0 = z512_inserti64x2(zmm_t0, xmm_crc3, 3); | 214 | | | 215 | | z_low0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x01); | 216 | | z_high0 = _mm512_clmulepi64_epi128(zmm_t0, zmm_fold4, 0x10); | 217 | | zmm_crc0 = z512_xor3_epi64(zmm_crc0, z_low0, z_high0); | 218 | | | 219 | | while (len >= 256) { | 220 | | len -= 256; | 221 | | zmm_t0 = _mm512_loadu_si512((__m512i *)src); | 222 | | zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); | 223 | | zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); | 224 | | zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); | 225 | | src += 256; | 226 | | | 227 | | fold_16(&zmm_crc0, &zmm_crc1, &zmm_crc2, &zmm_crc3, zmm_t0, zmm_t1, zmm_t2, zmm_t3, zmm_fold16); | 228 | | if 
(COPY) { | 229 | | _mm512_storeu_si512((__m512i *)dst, zmm_t0); | 230 | | _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); | 231 | | _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); | 232 | | _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); | 233 | | dst += 256; | 234 | | } | 235 | | } | 236 | | | 237 | | // zmm_crc[0,1,2,3] -> zmm_crc0 | 238 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); | 239 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); | 240 | | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc1); | 241 | | | 242 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); | 243 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); | 244 | | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc2); | 245 | | | 246 | | z_low0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); | 247 | | z_high0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); | 248 | | zmm_crc0 = z512_xor3_epi64(z_low0, z_high0, zmm_crc3); | 249 | | | 250 | | // zmm_crc0 -> xmm_crc[0, 1, 2, 3] | 251 | | xmm_crc0 = z512_extracti64x2(zmm_crc0, 0); | 252 | | xmm_crc1 = z512_extracti64x2(zmm_crc0, 1); | 253 | | xmm_crc2 = z512_extracti64x2(zmm_crc0, 2); | 254 | | xmm_crc3 = z512_extracti64x2(zmm_crc0, 3); | 255 | | } | 256 | | #else | 257 | | /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 | 258 | | * We interleave the PCLMUL-base folds with 8x scaled generator | 259 | | * polynomial copies; we read 8x QWORDS and then XOR them into | 260 | | * the stream at the following offsets: 6, 9, 10, 16, 20, 22, | 261 | | * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper | 262 | | * as "generator_64_bits_unrolled_8" */ | 263 | 1.18M | #ifndef __AVX512VL__ | 264 | 1.18M | if (!COPY) { | 265 | 1.16M | #endif | 266 | 1.89M | while (len >= 512 + 64 + 16*8) { | 267 | 735k | __m128i chorba8 = _mm_load_si128((__m128i *)src); | 268 | 735k | __m128i chorba7 = _mm_load_si128((__m128i *)src + 1); | 269 | 735k | __m128i 
chorba6 = _mm_load_si128((__m128i *)src + 2); | 270 | 735k | __m128i chorba5 = _mm_load_si128((__m128i *)src + 3); | 271 | 735k | __m128i chorba4 = _mm_load_si128((__m128i *)src + 4); | 272 | 735k | __m128i chorba3 = _mm_load_si128((__m128i *)src + 5); | 273 | 735k | __m128i chorba2 = _mm_load_si128((__m128i *)src + 6); | 274 | 735k | __m128i chorba1 = _mm_load_si128((__m128i *)src + 7); | 275 | 735k | if (COPY) { | 276 | 0 | _mm_storeu_si128((__m128i *)dst, chorba8); | 277 | 0 | _mm_storeu_si128((__m128i *)dst + 1, chorba7); | 278 | 0 | _mm_storeu_si128((__m128i *)dst + 2, chorba6); | 279 | 0 | _mm_storeu_si128((__m128i *)dst + 3, chorba5); | 280 | 0 | _mm_storeu_si128((__m128i *)dst + 4, chorba4); | 281 | 0 | _mm_storeu_si128((__m128i *)dst + 5, chorba3); | 282 | 0 | _mm_storeu_si128((__m128i *)dst + 6, chorba2); | 283 | 0 | _mm_storeu_si128((__m128i *)dst + 7, chorba1); | 284 | 0 | dst += 16*8; | 285 | 0 | } | 286 | | | 287 | 735k | chorba2 = _mm_xor_si128(chorba2, chorba8); | 288 | 735k | chorba1 = _mm_xor_si128(chorba1, chorba7); | 289 | 735k | src += 16*8; | 290 | 735k | len -= 16*8; | 291 | | | 292 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src); | 293 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 294 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); | 295 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 3); | 296 | | | 297 | 735k | fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); | 298 | 735k | if (COPY) { | 299 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 300 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 301 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 302 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 303 | 0 | dst += 64; | 304 | 0 | } | 305 | | | 306 | 735k | xmm_crc0 = z128_xor3_epi64(xmm_t0, chorba6, xmm_crc0); | 307 | 735k | xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba5, chorba8), xmm_crc1); | 308 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba4, chorba8), chorba7, 
xmm_crc2); | 309 | 735k | xmm_crc3 = z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba3, chorba7), chorba6, xmm_crc3); | 310 | | | 311 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 4); | 312 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 5); | 313 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 6); | 314 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 7); | 315 | | | 316 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 317 | 735k | if (COPY) { | 318 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 319 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 320 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 321 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 322 | 0 | dst += 64; | 323 | 0 | } | 324 | | | 325 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba6), chorba5, xmm_crc0); | 326 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba4), chorba5, xmm_crc1); | 327 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba3, chorba4), xmm_crc2); | 328 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(xmm_t3, chorba2, chorba3), xmm_crc3); | 329 | | | 330 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 8); | 331 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 9); | 332 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 10); | 333 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 11); | 334 | | | 335 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 336 | 735k | if (COPY) { | 337 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 338 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 339 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 340 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 341 | 0 | dst += 64; | 342 | 0 | } | 343 | | | 344 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba8, xmm_crc0); | 345 | 735k | xmm_crc1 = _mm_xor_si128(z128_xor3_epi64(xmm_t1, chorba1, chorba7), xmm_crc1); | 
346 | 735k | xmm_crc2 = z128_xor3_epi64(xmm_t2, chorba6, xmm_crc2); | 347 | 735k | xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba5, xmm_crc3); | 348 | | | 349 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 12); | 350 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 13); | 351 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 14); | 352 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 15); | 353 | | | 354 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 355 | 735k | if (COPY) { | 356 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 357 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 358 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 359 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 360 | 0 | dst += 64; | 361 | 0 | } | 362 | | | 363 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(xmm_t0, chorba4, chorba8), xmm_crc0); | 364 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba8), chorba7, xmm_crc1); | 365 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba8), chorba7, chorba6), xmm_crc2); | 366 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba7), chorba6, chorba5), xmm_crc3); | 367 | | | 368 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 16); | 369 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 17); | 370 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 18); | 371 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 19); | 372 | | | 373 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 374 | 735k | if (COPY) { | 375 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 376 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 377 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 378 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 379 | 0 | dst += 64; | 380 | 0 | } | 381 | | | 382 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba4, chorba8), chorba6, 
chorba5), xmm_crc0); | 383 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba3, chorba4), chorba8, chorba7), chorba5, xmm_crc1); | 384 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba4, chorba7), chorba6, xmm_crc2); | 385 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba3, chorba8), chorba6, chorba5), xmm_crc3); | 386 | | | 387 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 20); | 388 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 21); | 389 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 22); | 390 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 23); | 391 | | | 392 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 393 | 735k | if (COPY) { | 394 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 395 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 396 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 397 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 398 | 0 | dst += 64; | 399 | 0 | } | 400 | | | 401 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc0); | 402 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba3), chorba4, chorba7), chorba6, xmm_crc1); | 403 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba2, chorba3), chorba8, chorba6), chorba5, xmm_crc2); | 404 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba2), chorba4, chorba8), chorba7, chorba5), xmm_crc3); | 405 | | | 406 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 24); | 407 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 25); | 408 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 26); | 409 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 27); | 410 | | | 411 | 735k | 
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 412 | 735k | if (COPY) { | 413 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 414 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 415 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 416 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 417 | 0 | dst += 64; | 418 | 0 | } | 419 | | | 420 | 735k | xmm_crc0 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba1, chorba3), chorba4, chorba8), chorba7, chorba6), xmm_crc0); | 421 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba2, chorba3), chorba7, chorba6), chorba5, xmm_crc1); | 422 | 735k | xmm_crc2 = z128_xor3_epi64(z128_xor3_epi64(z128_xor3_epi64(xmm_t2, chorba1, chorba2), chorba4, chorba6), chorba5, xmm_crc2); | 423 | 735k | xmm_crc3 = _mm_xor_si128(z128_xor3_epi64(z128_xor3_epi64(xmm_t3, chorba1, chorba3), chorba4, chorba5), xmm_crc3); | 424 | | | 425 | 735k | xmm_t0 = _mm_load_si128((__m128i *)src + 28); | 426 | 735k | xmm_t1 = _mm_load_si128((__m128i *)src + 29); | 427 | 735k | xmm_t2 = _mm_load_si128((__m128i *)src + 30); | 428 | 735k | xmm_t3 = _mm_load_si128((__m128i *)src + 31); | 429 | | | 430 | 735k | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 431 | 735k | if (COPY) { | 432 | 0 | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 433 | 0 | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 434 | 0 | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 435 | 0 | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 436 | 0 | dst += 64; | 437 | 0 | } | 438 | | | 439 | 735k | xmm_crc0 = z128_xor3_epi64(z128_xor3_epi64(xmm_t0, chorba2, chorba3), chorba4, xmm_crc0); | 440 | 735k | xmm_crc1 = z128_xor3_epi64(z128_xor3_epi64(xmm_t1, chorba1, chorba2), chorba3, xmm_crc1); | 441 | 735k | xmm_crc2 = _mm_xor_si128(z128_xor3_epi64(xmm_t2, chorba1, chorba2), xmm_crc2); | 442 | 735k | xmm_crc3 = z128_xor3_epi64(xmm_t3, chorba1, xmm_crc3); | 443 | | | 444 | 735k | len -= 512; | 445 
| 735k | src += 512; | 446 | 735k | } | 447 | 1.16M | #ifndef __AVX512VL__ | 448 | 1.16M | } | 449 | 1.18M | #endif | 450 | | | 451 | 1.18M | #endif /* X86_VPCLMULQDQ */ | 452 | | | 453 | 11.4M | while (len >= 64) { | 454 | 10.2M | len -= 64; | 455 | 10.2M | xmm_t0 = _mm_load_si128((__m128i *)src); | 456 | 10.2M | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 457 | 10.2M | xmm_t2 = _mm_load_si128((__m128i *)src + 2); | 458 | 10.2M | xmm_t3 = _mm_load_si128((__m128i *)src + 3); | 459 | 10.2M | src += 64; | 460 | | | 461 | 10.2M | fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 462 | 10.2M | if (COPY) { | 463 | 9.51M | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 464 | 9.51M | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 465 | 9.51M | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 466 | 9.51M | _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); | 467 | 9.51M | dst += 64; | 468 | 9.51M | } | 469 | | | 470 | 10.2M | xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); | 471 | 10.2M | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); | 472 | 10.2M | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); | 473 | 10.2M | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); | 474 | 10.2M | } | 475 | | | 476 | | /* | 477 | | * len = num bytes left - 64 | 478 | | */ | 479 | 1.18M | if (len >= 48) { | 480 | 71.3k | len -= 48; | 481 | | | 482 | 71.3k | xmm_t0 = _mm_load_si128((__m128i *)src); | 483 | 71.3k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 484 | 71.3k | xmm_t2 = _mm_load_si128((__m128i *)src + 2); | 485 | 71.3k | src += 48; | 486 | | | 487 | 71.3k | fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 488 | 71.3k | if (COPY) { | 489 | 12.9k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 490 | 12.9k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 491 | 12.9k | _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); | 492 | 12.9k | dst += 48; | 493 | 12.9k | } | 494 | | | 495 | 71.3k | xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); | 496 | 71.3k | xmm_crc2 = _mm_xor_si128(xmm_crc2, 
xmm_t1); | 497 | 71.3k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); | 498 | 1.11M | } else if (len >= 32) { | 499 | 282k | len -= 32; | 500 | | | 501 | 282k | xmm_t0 = _mm_load_si128((__m128i *)src); | 502 | 282k | xmm_t1 = _mm_load_si128((__m128i *)src + 1); | 503 | 282k | src += 32; | 504 | | | 505 | 282k | fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 506 | 282k | if (COPY) { | 507 | 4.49k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 508 | 4.49k | _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); | 509 | 4.49k | dst += 32; | 510 | 4.49k | } | 511 | | | 512 | 282k | xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); | 513 | 282k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); | 514 | 834k | } else if (len >= 16) { | 515 | 216k | len -= 16; | 516 | 216k | xmm_t0 = _mm_load_si128((__m128i *)src); | 517 | 216k | src += 16; | 518 | | | 519 | 216k | fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, xmm_fold4); | 520 | 216k | if (COPY) { | 521 | 3.29k | _mm_storeu_si128((__m128i *)dst, xmm_t0); | 522 | 3.29k | dst += 16; | 523 | 3.29k | } | 524 | | | 525 | 216k | xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); | 526 | 216k | } | 527 | | | 528 | 1.18M | const __m128i k12 = _mm_set_epi32(0x00000001, 0x751997d0, 0x00000000, 0xccaa009e); | 529 | 1.18M | const __m128i barrett_k = _mm_set_epi32(0x00000001, 0xdb710640, 0xb4e5b025, 0xf7011641); | 530 | | | 531 | | /* Fold 4x128-bit into a single 128-bit value using k1/k2 constants */ | 532 | 1.18M | __m128i x_low0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x01); | 533 | 1.18M | __m128i x_high0 = _mm_clmulepi64_si128(xmm_crc0, k12, 0x10); | 534 | 1.18M | xmm_crc1 = z128_xor3_epi64(xmm_crc1, x_low0, x_high0); | 535 | | | 536 | 1.18M | __m128i x_low1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x01); | 537 | 1.18M | __m128i x_high1 = _mm_clmulepi64_si128(xmm_crc1, k12, 0x10); | 538 | 1.18M | xmm_crc2 = z128_xor3_epi64(xmm_crc2, x_low1, x_high1); | 539 | | | 540 | 1.18M | __m128i x_low2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x01); | 
541 | 1.18M | __m128i x_high2 = _mm_clmulepi64_si128(xmm_crc2, k12, 0x10); | 542 | 1.18M | xmm_crc3 = z128_xor3_epi64(xmm_crc3, x_low2, x_high2); | 543 | | | 544 | | /* Fold remaining bytes into the 128-bit state */ | 545 | 1.18M | if (len) { | 546 | 873k | const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080); | 547 | 873k | const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); | 548 | | | 549 | | /* Create masks to shift bytes for partial input */ | 550 | 873k | __m128i xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)len - 16)); | 551 | 873k | __m128i xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3); | 552 | | | 553 | | /* Shift out bytes from crc3 to make space for new data */ | 554 | 873k | __m128i xmm_overflow = _mm_shuffle_epi8(xmm_crc3, xmm_shl); | 555 | 873k | xmm_crc3 = _mm_shuffle_epi8(xmm_crc3, xmm_shr); | 556 | | | 557 | | /* Insert the partial input into crc3 */ | 558 | | #if defined(__AVX512BW__) && defined(__AVX512VL__) | 559 | | __mmask16 k = (1 << len) - 1; | 560 | | __m128i xmm_crc_part = _mm_maskz_loadu_epi8(k, src); | 561 | | if (COPY) { | 562 | | _mm_mask_storeu_epi8(dst, k, xmm_crc_part); | 563 | | } | 564 | | #else | 565 | 873k | __m128i xmm_crc_part = _mm_setzero_si128(); | 566 | 873k | memcpy(&xmm_crc_part, src, len); | 567 | 873k | if (COPY) { | 568 | 11.8k | memcpy(dst, src, len); | 569 | 11.8k | } | 570 | 873k | #endif | 571 | 873k | __m128i part_aligned = _mm_shuffle_epi8(xmm_crc_part, xmm_shl); | 572 | 873k | xmm_crc3 = _mm_xor_si128(xmm_crc3, part_aligned); | 573 | | | 574 | | /* Fold the bytes that were shifted out back into crc3 */ | 575 | 873k | __m128i ovf_low = _mm_clmulepi64_si128(xmm_overflow, k12, 0x01); | 576 | 873k | __m128i ovf_high = _mm_clmulepi64_si128(xmm_overflow, k12, 0x10); | 577 | 873k | xmm_crc3 = z128_xor3_epi64(xmm_crc3, ovf_low, ovf_high); | 578 | 873k | } | 579 | | | 580 | | /* Reduce 128-bits to 32-bits using two-stage Barrett reduction */ | 581 | 1.18M | __m128i 
x_tmp0 = _mm_clmulepi64_si128(xmm_crc3, barrett_k, 0x00); | 582 | 1.18M | __m128i x_tmp1 = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x10); | 583 | | | 584 | 1.18M | x_tmp1 = _mm_blend_epi16(x_tmp1, _mm_setzero_si128(), 0xcf); | 585 | 1.18M | x_tmp0 = _mm_xor_si128(x_tmp1, xmm_crc3); | 586 | | | 587 | 1.18M | __m128i x_res_a = _mm_clmulepi64_si128(x_tmp0, barrett_k, 0x01); | 588 | 1.18M | __m128i x_res_b = _mm_clmulepi64_si128(x_res_a, barrett_k, 0x10); | 589 | | | 590 | 1.18M | crc = ((uint32_t)_mm_extract_epi32(x_res_b, 2)); | 591 | | | 592 | 1.18M | return ~crc; | 593 | 4.42M | } |
Unexecuted instantiation: crc32_vpclmulqdq.c:crc32_copy_impl |