/src/skia/third_party/externals/zlib/crc32_simd.c
Line | Count | Source |
1 | | /* crc32_simd.c |
2 | | * |
3 | | * Copyright 2017 The Chromium Authors |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the Chromium source repository LICENSE file. |
6 | | */ |
7 | | |
8 | | #include "crc32_simd.h" |
9 | | #if defined(CRC32_SIMD_AVX512_PCLMUL) |
10 | | |
11 | | /* |
12 | | * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer |
13 | | * length must be at least 256, and a multiple of 64. Based on: |
14 | | * |
15 | | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
16 | | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
17 | | */ |
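| | /* |
| |  * A hypothetical caller sketch (not part of this file): the dispatcher is |
| |  * assumed to hand this routine only a prefix satisfying the contract above |
| |  * (>= 256 bytes and a multiple of 64) and to finish the tail with a generic |
| |  * table-driven CRC: |
| |  * |
| |  *   if (len >= 256) { |
| |  *       z_size_t chunk = len & ~(z_size_t)63;      // round down to 64 bytes |
| |  *       crc = crc32_avx512_simd_(buf, chunk, crc); |
| |  *       buf += chunk; |
| |  *       len -= chunk; |
| |  *   } |
| |  *   crc = crc32_generic(buf, len, crc);            // hypothetical fallback |
| |  */ |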
18 | | |
19 | | #include <emmintrin.h> |
20 | | #include <smmintrin.h> |
21 | | #include <wmmintrin.h> |
22 | | #include <immintrin.h> |
23 | | |
24 | | uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ |
25 | | const unsigned char *buf, |
26 | | z_size_t len, |
27 | | uint32_t crc) |
28 | | { |
29 | | /* |
30 | | * Definitions of the bit-reflected domain constants k1,k2,k3,k4 |
31 | | * are similar to those given at the end of the paper, and remaining |
32 | | * constants and CRC32+Barrett polynomials remain unchanged. |
33 | | * |
34 | | * The index of x is scaled up from 128 (as used in the paper) to 512, as follows: |
35 | | * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a |
36 | | * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 |
37 | | * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
38 | | * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
39 | | */ |
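| | /* |
| |  * Sketch (not part of this file) of how such constants can be generated, |
| |  * following the ( x^n mod P(x) << 32 )' << 1 construction quoted above |
| |  * with P(x) = 0x104C11DB7; assumes <stdint.h>: |
| |  * |
| |  *   static uint32_t xn_mod_p(unsigned n)      // x^n mod P(x), bit i = x^i |
| |  *   { |
| |  *       uint32_t r = 1;                       // start from x^0 |
| |  *       while (n--) { |
| |  *           uint32_t carry = r & 0x80000000u; |
| |  *           r <<= 1;                          // multiply by x |
| |  *           if (carry) |
| |  *               r ^= 0x04C11DB7u;             // reduce the x^32 term mod P |
| |  *       } |
| |  *       return r; |
| |  *   } |
| |  * |
| |  *   static uint64_t k_constant(unsigned n)    // ( x^n mod P(x) << 32 )' << 1 |
| |  *   { |
| |  *       uint32_t v = xn_mod_p(n), r = 0; |
| |  *       for (int i = 0; i < 32; i++)          // bit-reflect over 32 bits |
| |  *           r |= ((v >> i) & 1u) << (31 - i); |
| |  *       return (uint64_t)r << 1; |
| |  *   } |
| |  * |
| |  * e.g. k_constant(512 * 4 + 32) is expected to reproduce k1 = 0x011542778a. |
| |  */ |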
40 | | static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, |
41 | | 0x011542778a, 0x01322d1430, |
42 | | 0x011542778a, 0x01322d1430, |
43 | | 0x011542778a, 0x01322d1430 }; |
44 | | static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, |
45 | | 0x0154442bd4, 0x01c6e41596, |
46 | | 0x0154442bd4, 0x01c6e41596, |
47 | | 0x0154442bd4, 0x01c6e41596 }; |
48 | | static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; |
49 | | static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; |
50 | | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
51 | | __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
52 | | __m128i a0, a1, a2, a3; |
53 | | |
54 | | /* |
55 | | * There's at least one block of 256. |
56 | | */ |
57 | | x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
58 | | x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
59 | | x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
60 | | x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
61 | | |
62 | | x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); |
63 | | |
64 | | x0 = _mm512_load_si512((__m512i *)k1k2); |
65 | | |
66 | | buf += 256; |
67 | | len -= 256; |
68 | | |
69 | | /* |
70 | | * Parallel fold blocks of 256, if any. |
71 | | */ |
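| | /* |
| |  * Each iteration advances the 256 bytes held in x1..x4 over the next 256 |
| |  * bytes of input: within every 128-bit lane, the low 64-bit half is |
| |  * carry-less multiplied by k1 = x^(512*4+32) and the high half by |
| |  * k2 = x^(512*4-32), and both products are XORed with the freshly loaded |
| |  * data y5..y8. |
| |  */ |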
72 | | while (len >= 256) |
73 | | { |
74 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
75 | | x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); |
76 | | x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); |
77 | | x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); |
78 | | |
79 | | |
80 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
81 | | x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); |
82 | | x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); |
83 | | x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); |
84 | | |
85 | | y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
86 | | y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
87 | | y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
88 | | y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
89 | | |
90 | | x1 = _mm512_xor_si512(x1, x5); |
91 | | x2 = _mm512_xor_si512(x2, x6); |
92 | | x3 = _mm512_xor_si512(x3, x7); |
93 | | x4 = _mm512_xor_si512(x4, x8); |
94 | | |
95 | | x1 = _mm512_xor_si512(x1, y5); |
96 | | x2 = _mm512_xor_si512(x2, y6); |
97 | | x3 = _mm512_xor_si512(x3, y7); |
98 | | x4 = _mm512_xor_si512(x4, y8); |
99 | | |
100 | | buf += 256; |
101 | | len -= 256; |
102 | | } |
103 | | |
104 | | /* |
105 | | * Fold into 512-bits. |
106 | | */ |
107 | | x0 = _mm512_load_si512((__m512i *)k3k4); |
108 | | |
109 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
110 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
111 | | x1 = _mm512_xor_si512(x1, x2); |
112 | | x1 = _mm512_xor_si512(x1, x5); |
113 | | |
114 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
115 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
116 | | x1 = _mm512_xor_si512(x1, x3); |
117 | | x1 = _mm512_xor_si512(x1, x5); |
118 | | |
119 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
120 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
121 | | x1 = _mm512_xor_si512(x1, x4); |
122 | | x1 = _mm512_xor_si512(x1, x5); |
123 | | |
124 | | /* |
125 | | * Single fold blocks of 64, if any. |
126 | | */ |
127 | | while (len >= 64) |
128 | | { |
129 | | x2 = _mm512_loadu_si512((__m512i *)buf); |
130 | | |
131 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
132 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
133 | | x1 = _mm512_xor_si512(x1, x2); |
134 | | x1 = _mm512_xor_si512(x1, x5); |
135 | | |
136 | | buf += 64; |
137 | | len -= 64; |
138 | | } |
139 | | |
140 | | /* |
141 | | * Fold 512-bits to 384-bits. |
142 | | */ |
143 | | a0 = _mm_load_si128((__m128i *)k5k6); |
144 | | |
145 | | a1 = _mm512_extracti32x4_epi32(x1, 0); |
146 | | a2 = _mm512_extracti32x4_epi32(x1, 1); |
147 | | |
148 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
149 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
150 | | |
151 | | a1 = _mm_xor_si128(a1, a3); |
152 | | a1 = _mm_xor_si128(a1, a2); |
153 | | |
154 | | /* |
155 | | * Fold 384-bits to 256-bits. |
156 | | */ |
157 | | a2 = _mm512_extracti32x4_epi32(x1, 2); |
158 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
159 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
160 | | a1 = _mm_xor_si128(a1, a3); |
161 | | a1 = _mm_xor_si128(a1, a2); |
162 | | |
163 | | /* |
164 | | * Fold 256-bits to 128-bits. |
165 | | */ |
166 | | a2 = _mm512_extracti32x4_epi32(x1, 3); |
167 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
168 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
169 | | a1 = _mm_xor_si128(a1, a3); |
170 | | a1 = _mm_xor_si128(a1, a2); |
171 | | |
172 | | /* |
173 | | * Fold 128-bits to 64-bits. |
174 | | */ |
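| | /* |
| |  * Two-step reduction of the 128-bit remainder: the low 64-bit half is |
| |  * multiplied by k6 (the high qword of k5k6, assumed to be x^(128-32) by |
| |  * analogy with the k1..k4 definitions above) and folded onto the high |
| |  * half, then the low 32 bits of that result are multiplied by k7 (x^64) |
| |  * and folded onto the remaining bits, leaving a 64-bit value for the |
| |  * Barrett step below. |
| |  */ |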
175 | | a2 = _mm_clmulepi64_si128(a1, a0, 0x10); |
176 | | a3 = _mm_setr_epi32(~0, 0, ~0, 0); |
177 | | a1 = _mm_srli_si128(a1, 8); |
178 | | a1 = _mm_xor_si128(a1, a2); |
179 | | |
180 | | a0 = _mm_loadl_epi64((__m128i*)k7k8); |
181 | | a2 = _mm_srli_si128(a1, 4); |
182 | | a1 = _mm_and_si128(a1, a3); |
183 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x00); |
184 | | a1 = _mm_xor_si128(a1, a2); |
185 | | |
186 | | /* |
187 | | * Barrett reduce to 32-bits. |
188 | | */ |
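| | /* |
| |  * Barrett reduction in the bit-reflected domain: the low 32 bits of the |
| |  * 64-bit remainder are carry-less multiplied by poly[1] (the reflected |
| |  * quotient x^64 / P(x)), the low 32 bits of that product are multiplied |
| |  * by poly[0] (the reflected polynomial), and XORing the result back into |
| |  * a1 leaves the final CRC in its second 32-bit word, extracted below. |
| |  */ |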
189 | | a0 = _mm_load_si128((__m128i*)poly); |
190 | | |
191 | | a2 = _mm_and_si128(a1, a3); |
192 | | a2 = _mm_clmulepi64_si128(a2, a0, 0x10); |
193 | | a2 = _mm_and_si128(a2, a3); |
194 | | a2 = _mm_clmulepi64_si128(a2, a0, 0x00); |
195 | | a1 = _mm_xor_si128(a1, a2); |
196 | | |
197 | | /* |
198 | | * Return the crc32. |
199 | | */ |
200 | | return _mm_extract_epi32(a1, 1); |
201 | | } |
202 | | |
203 | | #elif defined(CRC32_SIMD_SSE42_PCLMUL) |
204 | | |
205 | | /* |
206 | | * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
207 | | * length must be at least 64, and a multiple of 16. |
208 | | */ |
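| | /* |
| |  * The 64-byte minimum matches the initial load into the four accumulators |
| |  * x1..x4 (4 x 16 bytes), and the multiple-of-16 requirement matches the |
| |  * single-fold loop, which consumes 16 bytes per iteration.  Shorter |
| |  * buffers and any trailing tail are assumed to be handled by the caller |
| |  * with a generic CRC routine. |
| |  */ |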
209 | | |
210 | | #include <emmintrin.h> |
211 | | #include <smmintrin.h> |
212 | | #include <wmmintrin.h> |
213 | | |
214 | | uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ |
215 | | const unsigned char *buf, |
216 | | z_size_t len, |
217 | | uint32_t crc) |
218 | 90.3k | { |
219 | | /* |
220 | | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
221 | | * the CRC32+Barrett polynomials given at the end of the paper. |
222 | | */ |
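| | /* |
| |  * In the exponent notation used in the AVX512 section above, these are |
| |  * assumed to correspond to: |
| |  *   k1 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
| |  *   k2 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
| |  *   k3 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 = 0x01751997d0 |
| |  *   k4 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 = 0x00ccaa009e |
| |  *   k5 = ( x ^ ( 64 ) mod P(x) << 32 )'       << 1 = 0x0163cd6124 |
| |  * i.e. the parallel folds advance by 64 bytes and the single folds by 16. |
| |  */ |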
223 | 90.3k | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
224 | 90.3k | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
225 | 90.3k | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
226 | 90.3k | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
227 | | |
228 | 90.3k | __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
229 | | |
230 | | /* |
231 | | * There's at least one block of 64. |
232 | | */ |
233 | 90.3k | x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
234 | 90.3k | x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
235 | 90.3k | x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
236 | 90.3k | x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
237 | | |
238 | 90.3k | x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); |
239 | | |
240 | 90.3k | x0 = _mm_load_si128((__m128i *)k1k2); |
241 | | |
242 | 90.3k | buf += 64; |
243 | 90.3k | len -= 64; |
244 | | |
245 | | /* |
246 | | * Parallel fold blocks of 64, if any. |
247 | | */ |
248 | 999k | while (len >= 64) |
249 | 908k | { |
250 | 908k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
251 | 908k | x6 = _mm_clmulepi64_si128(x2, x0, 0x00); |
252 | 908k | x7 = _mm_clmulepi64_si128(x3, x0, 0x00); |
253 | 908k | x8 = _mm_clmulepi64_si128(x4, x0, 0x00); |
254 | | |
255 | 908k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
256 | 908k | x2 = _mm_clmulepi64_si128(x2, x0, 0x11); |
257 | 908k | x3 = _mm_clmulepi64_si128(x3, x0, 0x11); |
258 | 908k | x4 = _mm_clmulepi64_si128(x4, x0, 0x11); |
259 | | |
260 | 908k | y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
261 | 908k | y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
262 | 908k | y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
263 | 908k | y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
264 | | |
265 | 908k | x1 = _mm_xor_si128(x1, x5); |
266 | 908k | x2 = _mm_xor_si128(x2, x6); |
267 | 908k | x3 = _mm_xor_si128(x3, x7); |
268 | 908k | x4 = _mm_xor_si128(x4, x8); |
269 | | |
270 | 908k | x1 = _mm_xor_si128(x1, y5); |
271 | 908k | x2 = _mm_xor_si128(x2, y6); |
272 | 908k | x3 = _mm_xor_si128(x3, y7); |
273 | 908k | x4 = _mm_xor_si128(x4, y8); |
274 | | |
275 | 908k | buf += 64; |
276 | 908k | len -= 64; |
277 | 908k | } |
278 | | |
279 | | /* |
280 | | * Fold into 128-bits. |
281 | | */ |
282 | 90.3k | x0 = _mm_load_si128((__m128i *)k3k4); |
283 | | |
284 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
285 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
286 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
287 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
288 | | |
289 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
290 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
291 | 90.3k | x1 = _mm_xor_si128(x1, x3); |
292 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
293 | | |
294 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
295 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
296 | 90.3k | x1 = _mm_xor_si128(x1, x4); |
297 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
298 | | |
299 | | /* |
300 | | * Single fold blocks of 16, if any. |
301 | | */ |
302 | 173k | while (len >= 16) |
303 | 82.9k | { |
304 | 82.9k | x2 = _mm_loadu_si128((__m128i *)buf); |
305 | | |
306 | 82.9k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
307 | 82.9k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
308 | 82.9k | x1 = _mm_xor_si128(x1, x2); |
309 | 82.9k | x1 = _mm_xor_si128(x1, x5); |
310 | | |
311 | 82.9k | buf += 16; |
312 | 82.9k | len -= 16; |
313 | 82.9k | } |
314 | | |
315 | | /* |
316 | | * Fold 128-bits to 64-bits. |
317 | | */ |
318 | 90.3k | x2 = _mm_clmulepi64_si128(x1, x0, 0x10); |
319 | 90.3k | x3 = _mm_setr_epi32(~0, 0, ~0, 0); |
320 | 90.3k | x1 = _mm_srli_si128(x1, 8); |
321 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
322 | | |
323 | 90.3k | x0 = _mm_loadl_epi64((__m128i*)k5k0); |
324 | | |
325 | 90.3k | x2 = _mm_srli_si128(x1, 4); |
326 | 90.3k | x1 = _mm_and_si128(x1, x3); |
327 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x00); |
328 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
329 | | |
330 | | /* |
331 | | * Barrett reduce to 32-bits. |
332 | | */ |
333 | 90.3k | x0 = _mm_load_si128((__m128i*)poly); |
334 | | |
335 | 90.3k | x2 = _mm_and_si128(x1, x3); |
336 | 90.3k | x2 = _mm_clmulepi64_si128(x2, x0, 0x10); |
337 | 90.3k | x2 = _mm_and_si128(x2, x3); |
338 | 90.3k | x2 = _mm_clmulepi64_si128(x2, x0, 0x00); |
339 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
340 | | |
341 | | /* |
342 | | * Return the crc32. |
343 | | */ |
344 | 90.3k | return _mm_extract_epi32(x1, 1); |
345 | 90.3k | } |
346 | | |
347 | | #elif defined(CRC32_ARMV8_CRC32) |
348 | | |
349 | | /* CRC32 checksums using ARMv8-a crypto instructions. |
350 | | */ |
351 | | |
352 | | #if defined(__clang__) |
353 | | /* We need some extra types for using PMULL. |
354 | | */ |
355 | | #if defined(__aarch64__) |
356 | | #include <arm_neon.h> |
357 | | #include <arm_acle.h> |
358 | | #endif |
359 | | |
360 | | /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an |
361 | | * armv8 target, which is incompatible with ThinLTO optimizations on Android. |
362 | | * (Namely, mixing and matching different module-level targets makes ThinLTO |
363 | | * warn, and Android defaults to armv7-a. This restriction does not apply to |
364 | | * function-level `target`s, however.) |
365 | | * |
366 | | * Since we only need four crc intrinsics, and since clang's implementation of |
367 | | * those are just wrappers around compiler builtins, it's simplest to #define |
368 | | * those builtins directly. If this #define list grows too much (or we depend on |
369 | | * an intrinsic that isn't a trivial wrapper), we may have to find a better way |
370 | | * to go about this. |
371 | | * |
372 | | * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized |
373 | | * feature for this target (ignoring feature)." This appears to be a harmless |
374 | | * bug in clang. |
375 | | * |
376 | | * These definitions must appear *after* including arm_acle.h otherwise that |
377 | | * header may end up defining functions named __builtin_arm_crc32* that call |
378 | | * themselves, creating an infinite loop when the intrinsic is called. |
379 | | */ |
380 | | /* XXX: Cannot hook into builtins with Xcode for arm64. */ |
381 | | #if !defined(ARMV8_OS_MACOS) |
382 | | #define __crc32b __builtin_arm_crc32b |
383 | | #define __crc32d __builtin_arm_crc32d |
384 | | #define __crc32w __builtin_arm_crc32w |
385 | | #define __crc32cw __builtin_arm_crc32cw |
386 | | #endif |
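| | /* With the definitions above, a call such as __crc32d(c, v) in the code |
| |  * below compiles directly as the __builtin_arm_crc32d builtin, so no |
| |  * module-level armv8 target is required. |
| |  */ |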
387 | | |
388 | | #if defined(__aarch64__) |
389 | | #define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc"))) |
390 | | #else // !defined(__aarch64__) |
391 | | #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) |
392 | | #endif // defined(__aarch64__) |
393 | | |
394 | | #elif defined(__GNUC__) |
395 | | /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not |
396 | | * allowed. We can just include arm_acle.h. |
397 | | */ |
398 | | #include <arm_acle.h> |
399 | | #include <arm_neon.h> |
400 | | #define TARGET_ARMV8_WITH_CRC |
401 | | #else // !defined(__GNUC__) && !defined(__clang__) |
402 | | #error ARM CRC32 SIMD extensions only supported for Clang and GCC |
403 | | #endif |
404 | | |
405 | | TARGET_ARMV8_WITH_CRC |
406 | | uint32_t ZLIB_INTERNAL armv8_crc32_little( |
407 | | const unsigned char *buf, |
408 | | z_size_t len, |
409 | | uint32_t crc) |
410 | | { |
411 | | uint32_t c = (uint32_t) ~crc; |
412 | | |
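| | /* Process single bytes until buf reaches an 8-byte boundary. */ |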
413 | | while (len && ((uintptr_t)buf & 7)) { |
414 | | c = __crc32b(c, *buf++); |
415 | | --len; |
416 | | } |
417 | | |
418 | | const uint64_t *buf8 = (const uint64_t *)buf; |
419 | | |
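| | /* Main loop: eight hardware CRC32 steps per 64-byte block. */ |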
420 | | while (len >= 64) { |
421 | | c = __crc32d(c, *buf8++); |
422 | | c = __crc32d(c, *buf8++); |
423 | | c = __crc32d(c, *buf8++); |
424 | | c = __crc32d(c, *buf8++); |
425 | | |
426 | | c = __crc32d(c, *buf8++); |
427 | | c = __crc32d(c, *buf8++); |
428 | | c = __crc32d(c, *buf8++); |
429 | | c = __crc32d(c, *buf8++); |
430 | | len -= 64; |
431 | | } |
432 | | |
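| | /* Fold in any remaining whole 8-byte words. */ |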
433 | | while (len >= 8) { |
434 | | c = __crc32d(c, *buf8++); |
435 | | len -= 8; |
436 | | } |
437 | | |
438 | | buf = (const unsigned char *)buf8; |
439 | | |
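| | /* Final 0..7 bytes, one at a time. */ |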
440 | | while (len--) { |
441 | | c = __crc32b(c, *buf++); |
442 | | } |
443 | | |
444 | | return ~c; |
445 | | } |
446 | | |
447 | | #if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */ |
448 | | |
449 | | /* |
450 | | * armv8_crc32_pmull_little(): compute the crc32 of the buffer, where the buffer |
451 | | * length must be at least 64, and a multiple of 16. Based on: |
452 | | * |
453 | | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
454 | | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
455 | | */ |
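| | /* |
| |  * Thin wrappers around the PMULL/PMULL2 instructions: pmull_lo multiplies |
| |  * the low 64-bit lanes of a and b, pmull_hi the high lanes, and pmull_01 |
| |  * the low lane of a by the high lane of b, each yielding a 128-bit |
| |  * carry-less product (the counterparts of _mm_clmulepi64_si128 with |
| |  * immediates 0x00, 0x11 and 0x10 in the SSE code above). |
| |  */ |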
456 | | TARGET_ARMV8_WITH_CRC |
457 | | static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b) |
458 | | { |
459 | | uint8x16_t r; |
460 | | __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t" |
461 | | : "=w" (r) : "w" (a), "w" (b) ); |
462 | | return r; |
463 | | } |
464 | | |
465 | | TARGET_ARMV8_WITH_CRC |
466 | | static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b) |
467 | | { |
468 | | uint8x16_t r; |
469 | | __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t" |
470 | | : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) ); |
471 | | return r; |
472 | | } |
473 | | |
474 | | TARGET_ARMV8_WITH_CRC |
475 | | static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b) |
476 | | { |
477 | | uint8x16_t r; |
478 | | __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t" |
479 | | : "=w" (r) : "w" (a), "w" (b) ); |
480 | | return r; |
481 | | } |
482 | | |
483 | | TARGET_ARMV8_WITH_CRC |
484 | | uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little( |
485 | | const unsigned char *buf, |
486 | | z_size_t len, |
487 | | uint32_t crc) |
488 | | { |
489 | | /* |
490 | | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
491 | | * the CRC32+Barrett polynomials given at the end of the paper. |
492 | | */ |
493 | | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
494 | | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
495 | | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
496 | | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
497 | | |
498 | | uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
499 | | |
500 | | /* |
501 | | * There's at least one block of 64. |
502 | | */ |
503 | | x1 = vld1q_u64((const uint64_t *)(buf + 0x00)); |
504 | | x2 = vld1q_u64((const uint64_t *)(buf + 0x10)); |
505 | | x3 = vld1q_u64((const uint64_t *)(buf + 0x20)); |
506 | | x4 = vld1q_u64((const uint64_t *)(buf + 0x30)); |
507 | | |
508 | | x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0)); |
509 | | |
510 | | x0 = vld1q_u64(k1k2); |
511 | | |
512 | | buf += 64; |
513 | | len -= 64; |
514 | | |
515 | | /* |
516 | | * Parallel fold blocks of 64, if any. |
517 | | */ |
518 | | while (len >= 64) |
519 | | { |
520 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
521 | | x6 = (uint64x2_t) pmull_lo(x2, x0); |
522 | | x7 = (uint64x2_t) pmull_lo(x3, x0); |
523 | | x8 = (uint64x2_t) pmull_lo(x4, x0); |
524 | | |
525 | | y5 = vld1q_u64((const uint64_t *)(buf + 0x00)); |
526 | | y6 = vld1q_u64((const uint64_t *)(buf + 0x10)); |
527 | | y7 = vld1q_u64((const uint64_t *)(buf + 0x20)); |
528 | | y8 = vld1q_u64((const uint64_t *)(buf + 0x30)); |
529 | | |
530 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
531 | | x2 = (uint64x2_t) pmull_hi(x2, x0); |
532 | | x3 = (uint64x2_t) pmull_hi(x3, x0); |
533 | | x4 = (uint64x2_t) pmull_hi(x4, x0); |
534 | | |
535 | | x1 = veorq_u64(x1, x5); |
536 | | x2 = veorq_u64(x2, x6); |
537 | | x3 = veorq_u64(x3, x7); |
538 | | x4 = veorq_u64(x4, x8); |
539 | | |
540 | | x1 = veorq_u64(x1, y5); |
541 | | x2 = veorq_u64(x2, y6); |
542 | | x3 = veorq_u64(x3, y7); |
543 | | x4 = veorq_u64(x4, y8); |
544 | | |
545 | | buf += 64; |
546 | | len -= 64; |
547 | | } |
548 | | |
549 | | /* |
550 | | * Fold into 128-bits. |
551 | | */ |
552 | | x0 = vld1q_u64(k3k4); |
553 | | |
554 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
555 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
556 | | x1 = veorq_u64(x1, x2); |
557 | | x1 = veorq_u64(x1, x5); |
558 | | |
559 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
560 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
561 | | x1 = veorq_u64(x1, x3); |
562 | | x1 = veorq_u64(x1, x5); |
563 | | |
564 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
565 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
566 | | x1 = veorq_u64(x1, x4); |
567 | | x1 = veorq_u64(x1, x5); |
568 | | |
569 | | /* |
570 | | * Single fold blocks of 16, if any. |
571 | | */ |
572 | | while (len >= 16) |
573 | | { |
574 | | x2 = vld1q_u64((const uint64_t *)buf); |
575 | | |
576 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
577 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
578 | | x1 = veorq_u64(x1, x2); |
579 | | x1 = veorq_u64(x1, x5); |
580 | | |
581 | | buf += 16; |
582 | | len -= 16; |
583 | | } |
584 | | |
585 | | /* |
586 | | * Fold 128-bits to 64-bits. |
587 | | */ |
588 | | static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u }; |
589 | | |
590 | | x2 = (uint64x2_t) pmull_01(x1, x0); |
591 | | x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8); |
592 | | x3 = (uint64x2_t) vld1q_u32(mask); |
593 | | x1 = veorq_u64(x1, x2); |
594 | | |
595 | | x0 = vld1q_u64(k5k0); |
596 | | |
597 | | x2 = (uint64x2_t) pmull_01(x2, x0); |
598 | | x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4); |
599 | | x1 = vandq_u64(x1, x3); |
600 | | x1 = (uint64x2_t) pmull_lo(x1, x0); |
601 | | x1 = veorq_u64(x1, x2); |
602 | | |
603 | | /* |
604 | | * Barrett reduce to 32-bits. |
605 | | */ |
606 | | x0 = vld1q_u64(poly); |
607 | | |
608 | | x2 = vandq_u64(x1, x3); |
609 | | x2 = (uint64x2_t) pmull_01(x2, x0); |
610 | | x2 = vandq_u64(x2, x3); |
611 | | x2 = (uint64x2_t) pmull_lo(x2, x0); |
612 | | x1 = veorq_u64(x1, x2); |
613 | | |
614 | | /* |
615 | | * Return the crc32. |
616 | | */ |
617 | | return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1); |
618 | | } |
619 | | #endif /* aarch64 specific code. */ |
620 | | |
621 | | #endif |