/src/zlib-ng/arch/x86/chorba_sse2.c
#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2)

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"
#include "crc32.h"
#include <emmintrin.h>
#include "arch/x86/x86_intrins.h"
#include "arch/generic/generic_functions.h"
#include <assert.h>

uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);
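
/* READ_NEXT pulls in the next 32 bytes at byte offset "off" as two __m128i
 * values using aligned loads; "in" must therefore be 16-byte aligned, which
 * crc32_chorba_sse2() below guarantees. */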
#define READ_NEXT(in, off, a, b) do { \
    a = _mm_load_si128((const __m128i *)((in) + (off) / sizeof(uint64_t))); \
    b = _mm_load_si128((const __m128i *)((in) + (off) / sizeof(uint64_t) + 2)); \
} while (0)
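
/* One Chorba folding step, applied to both 64-bit lanes of "invec" at once:
 * each output is an XOR of shifted copies of the input, matching the scalar
 * a1..a4/b1..b4 expressions in the reference comments below. */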
#define NEXT_ROUND(invec, a, b, c, d) do { \
    a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
    b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
    c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
    d = _mm_srli_epi64(invec, 20); \
} while (0)
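
/* For documentation only (compiled out): a minimal scalar sketch of what one
 * NEXT_ROUND lane computes, mirroring the commented-out reference code kept
 * throughout this file. The helper name is ours, not part of any API. */
#if 0
static inline void chorba_round_scalar(uint64_t in, uint64_t *o1, uint64_t *o2,
                                       uint64_t *o3, uint64_t *o4) {
    *o1 = (in << 17) ^ (in << 55);
    *o2 = (in >> 47) ^ (in >> 9) ^ (in << 19);
    *o3 = (in >> 45) ^ (in << 44);
    *o4 = (in >> 20);
}
#endif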

Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint64_t* buf, size_t len) {
    const uint64_t* input = buf;
    ALIGNED_(16) uint64_t final[9] = {0};
    uint64_t next1 = crc;
    crc = 0;
    uint64_t next2 = 0;
    uint64_t next3 = 0;
    uint64_t next4 = 0;
    uint64_t next5 = 0;

    __m128i next12 = _mm_cvtsi64_si128(next1);
    __m128i next34 = _mm_setzero_si128();
    __m128i next56 = _mm_setzero_si128();
    __m128i ab1, ab2, ab3, ab4, cd1, cd2, cd3, cd4;
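
    /* The five scalar carries next1..next5 of the reference code travel
     * packed in vector registers: next12 = (next1, next2), next34 =
     * (next3, next4), next56 = (next5, 0); they are unpacked back into
     * scalars after the main loops. */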

    size_t i = 0;

    /* Oddly, using a for loop here rather than the equivalent while loop
     * drops about 10% off the execution time. Each iteration advances i by
     * 320 bytes: one 64-byte chorba block plus eight 32-byte rounds. */
    for (; (i + 256 + 40 + 32 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;

        /*
        uint64_t chorba1 = input[i / sizeof(uint64_t)];
        uint64_t chorba2 = input[i / sizeof(uint64_t) + 1];
        uint64_t chorba3 = input[i / sizeof(uint64_t) + 2];
        uint64_t chorba4 = input[i / sizeof(uint64_t) + 3];
        uint64_t chorba5 = input[i / sizeof(uint64_t) + 4];
        uint64_t chorba6 = input[i / sizeof(uint64_t) + 5];
        uint64_t chorba7 = input[i / sizeof(uint64_t) + 6];
        uint64_t chorba8 = input[i / sizeof(uint64_t) + 7];
        */

        const uint64_t *inputPtr = input + (i / sizeof(uint64_t));
        const __m128i *inputPtr128 = (const __m128i *)inputPtr;
        __m128i chorba12 = _mm_load_si128(inputPtr128++);
        __m128i chorba34 = _mm_load_si128(inputPtr128++);
        __m128i chorba56 = _mm_load_si128(inputPtr128++);
        __m128i chorba78 = _mm_load_si128(inputPtr128++);

        chorba12 = _mm_xor_si128(chorba12, next12);
        chorba34 = _mm_xor_si128(chorba34, next34);
        chorba56 = _mm_xor_si128(chorba56, next56);
        chorba78 = _mm_xor_si128(chorba78, chorba12);
        __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
        __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12), _mm_castsi128_pd(chorba34), 1));
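        /* _mm_shuffle_pd with immediate 1 picks (high lane of first arg,
         * low lane of second), so chorba23 = (chorba2, chorba3) and
         * chorba45 = (chorba4, chorba5); the casts exist only to satisfy the
         * intrinsic's operand types. */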
        /*
        chorba1 ^= next1;
        chorba2 ^= next2;
        chorba3 ^= next3;
        chorba4 ^= next4;
        chorba5 ^= next5;
        chorba7 ^= chorba1;
        chorba8 ^= chorba2;
        */
        i += 8 * 8;

        /* 0-3 */
        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        READ_NEXT(input, i, in1in2, in3in4);
        __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba34xor);
        /*
        in1 ^= chorba3;
        in2 ^= chorba4 ^ chorba1;
        */
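        /* _mm_unpacklo_epi64(zero, x) builds (0, low lane of x); that is how
         * a term lands in only one of the two 64-bit lanes, here chorba1
         * into in2 alone, as in the scalar comment above. */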

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */
        in3in4 = _mm_xor_si128(in3in4, ab1);
        /* _mm_shuffle_pd executes in the floating-point domain, so there may
         * be a bypass-delay penalty when its result feeds integer XORs; of
         * the orderings tried, this sequence measured best. */
        __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));

        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
        in3in4 = _mm_xor_si128(in3in4, chorba12);

        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= a1 ^ chorba5 ^ chorba2 ^ chorba1;
        in4 ^= b1 ^ a2 ^ chorba6 ^ chorba3 ^ chorba2;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = b3 ^ c2 ^ d1 ^ a4;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
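
        /* Only d4 survives into next56 and its high lane stays zero,
         * matching out5 = d4 in the comment above; later groups fold this
         * carry into their first output word, i.e. next1 = next5 ^ out1. */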

        i += 32;

        /* 4-7 */
        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        READ_NEXT(input, i, in1in2, in3in4);

        in1in2 = _mm_xor_si128(in1in2, next12);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, chorba45);
        in1in2 = _mm_xor_si128(in1in2, chorba34);

        /*
        in1 ^= next1 ^ chorba7 ^ chorba4 ^ chorba3;
        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba4;
        */

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];

        in3 ^= next3 ^ a1 ^ chorba6 ^ chorba5;
        in4 ^= next4 ^ b1 ^ a2 ^ chorba7 ^ chorba6;
        */
        in3in4 = _mm_xor_si128(in3in4, next34);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        in3in4 = _mm_xor_si128(in3in4, chorba56);
        __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, cd1);

        next12 = _mm_xor_si128(next12, a4_);
        next12 = _mm_xor_si128(next12, next56);
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        next34 = _mm_xor_si128(b4c4, cd3);
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */
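
        /* From the 8-11 group onward, next1 = next5 ^ out1 is realized in
         * vector form: _mm_unpacklo_epi64(next56, ab4) pairs the carried
         * next5 with a4 so a single XOR drops both into next12. */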

        i += 32;

        /* 8-11 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba1;
        in2 ^= next2 ^ chorba8 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);

        __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
        __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        in1in2 = _mm_xor_si128(in1in2, chorba78);
        in1in2 = _mm_xor_si128(in1in2, next12_chorba12);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];*/
        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, ab1);
        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, chorba34);
        in3in4 = _mm_xor_si128(in3in4, a2_);

        /*
        in3 ^= next3 ^ a1 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba4;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(a4_, ab3);
        next12 = _mm_xor_si128(next12, cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 12-15 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);
        __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
        __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
        in1in2 = _mm_xor_si128(in1in2, chorb1_);

        /*
        in1 ^= next1 ^ chorba5 ^ chorba1;
        in2 ^= next2 ^ chorba6 ^ chorba2 ^ chorba1;

        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba3 ^ chorba2 ^ chorba1;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba4 ^ chorba3 ^ chorba2;
        */

        in3in4 = _mm_xor_si128(next34, in3in4);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 16-19 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
        in2 ^= next2 ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba1 ^ chorba2;
        */
        READ_NEXT(input, i, in1in2, in3in4);
        __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
        in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
        in1in2 = _mm_xor_si128(chorba1_, in1in2);

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
        in3in4 = _mm_xor_si128(in3in4, next34);
        /*
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba2 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba3 ^ chorba4 ^ chorba1;
        */
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        i += 32;

        /* 20-23 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba7 ^ chorba4 ^ chorba5 ^ chorba2 ^ chorba1;
        in2 ^= next2 ^ chorba8 ^ chorba5 ^ chorba6 ^ chorba3 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba45, chorba56));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba1_, a2_));
        in3in4 = _mm_xor_si128(in3in4, chorba12);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        i += 32;

        /* 24-27 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
        in2 ^= next2 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
        */

        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba67));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba56, chorba34));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba23, chorba12));
        in1in2 = _mm_xor_si128(in1in2, chorba80);
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba80, a2_));
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */
        i += 32;

        /* 28-31 */
        /*
        in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];
        in1 ^= next1 ^ chorba7 ^ chorba6 ^ chorba5;
        in2 ^= next2 ^ chorba8 ^ chorba7 ^ chorba6;
        */
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(next12, chorba78));
        in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba67, chorba56));
        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);

        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1 ^ chorba8 ^ chorba7;
        in4 ^= next4 ^ a2 ^ b1 ^ chorba8;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */
        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        a4_ = _mm_unpacklo_epi64(next56, ab4);
        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        next12 = _mm_xor_si128(next12, b2c2);
        next34 = _mm_xor_si128(b4c4, cd3);
        next34 = _mm_xor_si128(next34, d2_);
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
    }
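
    /* Steady-state tail loop: the same two NEXT_ROUND steps per 32 bytes,
     * but with no 64-byte chorba block to fold in; it runs until at most
     * 72 bytes (40 + 32) remain for the scalar finish below. */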
    for (; (i + 40 + 32) < len; i += 32) {
        __m128i in1in2, in3in4;

        /*in1 = input[i / sizeof(uint64_t)];
        in2 = input[i / sizeof(uint64_t) + 1];*/
        READ_NEXT(input, i, in1in2, in3in4);
        in1in2 = _mm_xor_si128(in1in2, next12);

        /*
        in1 ^= next1;
        in2 ^= next2;
        */

        NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
        /*
        a1 = (in1 << 17) ^ (in1 << 55);
        a2 = (in1 >> 47) ^ (in1 >> 9) ^ (in1 << 19);
        a3 = (in1 >> 45) ^ (in1 << 44);
        a4 = (in1 >> 20);

        b1 = (in2 << 17) ^ (in2 << 55);
        b2 = (in2 >> 47) ^ (in2 >> 9) ^ (in2 << 19);
        b3 = (in2 >> 45) ^ (in2 << 44);
        b4 = (in2 >> 20);
        */

        /*
        in3 = input[i / sizeof(uint64_t) + 2];
        in4 = input[i / sizeof(uint64_t) + 3];
        in3 ^= next3 ^ a1;
        in4 ^= next4 ^ a2 ^ b1;

        c1 = (in3 << 17) ^ (in3 << 55);
        c2 = (in3 >> 47) ^ (in3 >> 9) ^ (in3 << 19);
        c3 = (in3 >> 45) ^ (in3 << 44);
        c4 = (in3 >> 20);

        d1 = (in4 << 17) ^ (in4 << 55);
        d2 = (in4 >> 47) ^ (in4 >> 9) ^ (in4 << 19);
        d3 = (in4 >> 45) ^ (in4 << 44);
        d4 = (in4 >> 20);
        */

        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
        __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
        in3in4 = _mm_xor_si128(in3in4, ab1_next34);
        in3in4 = _mm_xor_si128(a2_, in3in4);
        NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);

        /*
        out1 = a3 ^ b2 ^ c1;
        out2 = a4 ^ b3 ^ c2 ^ d1;
        out3 = b4 ^ c3 ^ d2;
        out4 = c4 ^ d3;
        out5 = d4;

        next1 = next5 ^ out1;
        next2 = out2;
        next3 = out3;
        next4 = out4;
        next5 = out5;
        */

        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
        a4_ = _mm_xor_si128(b2c2, a4_);
        next12 = _mm_xor_si128(ab3, a4_);
        next12 = _mm_xor_si128(next12, cd1);

        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
        next12 = _mm_xor_si128(next12, next56);
        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
    }

    /* Unpack the vector carries back into the scalar next1..next5. */
    next1 = _mm_cvtsi128_si64(next12);
    next2 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next12, next12));
    next3 = _mm_cvtsi128_si64(next34);
    next4 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(next34, next34));
    next5 = _mm_cvtsi128_si64(next56);

    /* Copy what remains into the aligned scratch buffer by hand, skipping
     * the call to memcpy. */
    size_t copy_len = len - i;
    __m128i *final128 = (__m128i *)final;
    const __m128i *input128 = (const __m128i *)(input + i / sizeof(uint64_t));
    while (copy_len >= 64) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 64;
    }

    while (copy_len >= 16) {
        _mm_store_si128(final128++, _mm_load_si128(input128++));
        copy_len -= 16;
    }

    const uint8_t *src_bytes = (const uint8_t *)input128;
    uint8_t *dst_bytes = (uint8_t *)final128;
    while (copy_len--) {
        *dst_bytes++ = *src_bytes++;
    }

    final[0] ^= next1;
    final[1] ^= next2;
    final[2] ^= next3;
    final[3] ^= next4;
    final[4] ^= next5;

    /* Run the same byte-at-a-time loop that crc32_braid_internal performs,
     * skipping the function call for this tiny tail. */
    uint8_t *final_bytes = (uint8_t *)final;
    size_t rem = len - i;

    while (rem--) {
        crc = crc_table[(crc ^ *final_bytes++) & 0xff] ^ (crc >> 8);
    }

    return crc;
}
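
/* Dispatch: bring the buffer to 16-byte alignment with the braid code, then
 * pick a kernel by length. Buffers of at most 72 bytes stay with braid,
 * larger ones use the small chorba kernel above, and anything beyond
 * CHORBA_LARGE_THRESHOLD goes to the generic large-input implementation. */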
Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
    uint32_t c;
    uint64_t* aligned_buf;
    size_t aligned_len;

    c = (~crc) & 0xffffffff;
    unsigned long algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15;
    if (algn_diff < len) {
        if (algn_diff) {
            c = crc32_braid_internal(c, buf, algn_diff);
        }
        aligned_buf = (uint64_t *)(buf + algn_diff);
        aligned_len = len - algn_diff;
        if (aligned_len > CHORBA_LARGE_THRESHOLD) {
            c = crc32_chorba_118960_nondestructive(c, (z_word_t *)aligned_buf, aligned_len);
        } else if (aligned_len > 72) {
            c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len);
        } else {
            c = crc32_braid_internal(c, (uint8_t *)aligned_buf, aligned_len);
        }
    } else {
        c = crc32_braid_internal(c, buf, len);
    }

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}
#endif