Coverage Report

Created: 2024-05-20 07:14

/src/skia/third_party/externals/zlib/crc32_simd.c
Line| Count|Source
   1|      |/* crc32_simd.c
   2|      | *
   3|      | * Copyright 2017 The Chromium Authors
   4|      | * Use of this source code is governed by a BSD-style license that can be
   5|      | * found in the Chromium source repository LICENSE file.
   6|      | */
   7|      |
   8|      |#include "crc32_simd.h"
   9|      |#if defined(CRC32_SIMD_AVX512_PCLMUL)
  10|      |
  11|      |/*
  12|      | * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
  13|      | * length must be at least 256, and a multiple of 64. Based on:
  14|      | *
  15|      | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  16|      | *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
  17|      | */
  18|      |
  19|      |#include <emmintrin.h>
  20|      |#include <smmintrin.h>
  21|      |#include <wmmintrin.h>
  22|      |#include <immintrin.h>
  23|      |
  24|      |uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
  25|      |    const unsigned char *buf,
  26|      |    z_size_t len,
  27|      |    uint32_t crc)
  28|      |{
  29|      |    /*
  30|      |     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
  31|      |     * are similar to those given at the end of the paper, and remaining
  32|      |     * constants and CRC32+Barrett polynomials remain unchanged.
  33|      |     *
  34|      |     * Replace the index of x from 128 to 512. As follows:
  35|      |     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
  36|      |     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
  37|      |     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
  38|      |     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
  39|      |     */
  40|      |    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
  41|      |                                                0x011542778a, 0x01322d1430,
  42|      |                                                0x011542778a, 0x01322d1430,
  43|      |                                                0x011542778a, 0x01322d1430 };
  44|      |    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
  45|      |                                                0x0154442bd4, 0x01c6e41596,
  46|      |                                                0x0154442bd4, 0x01c6e41596,
  47|      |                                                0x0154442bd4, 0x01c6e41596 };
  48|      |    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
  49|      |    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
  50|      |    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
  51|      |    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
  52|      |    __m128i a0, a1, a2, a3;
  53|      |
  54|      |    /*
  55|      |     * There's at least one block of 256.
  56|      |     */
  57|      |    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
  58|      |    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
  59|      |    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
  60|      |    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
  61|      |
  62|      |    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
  63|      |
  64|      |    x0 = _mm512_load_si512((__m512i *)k1k2);
  65|      |
  66|      |    buf += 256;
  67|      |    len -= 256;
  68|      |
  69|      |    /*
  70|      |     * Parallel fold blocks of 256, if any.
  71|      |     */
  72|      |    while (len >= 256)
  73|      |    {
  74|      |        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
  75|      |        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
  76|      |        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
  77|      |        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
  78|      |
  79|      |
  80|      |        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
  81|      |        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
  82|      |        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
  83|      |        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
  84|      |
  85|      |        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
  86|      |        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
  87|      |        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
  88|      |        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
  89|      |
  90|      |        x1 = _mm512_xor_si512(x1, x5);
  91|      |        x2 = _mm512_xor_si512(x2, x6);
  92|      |        x3 = _mm512_xor_si512(x3, x7);
  93|      |        x4 = _mm512_xor_si512(x4, x8);
  94|      |
  95|      |        x1 = _mm512_xor_si512(x1, y5);
  96|      |        x2 = _mm512_xor_si512(x2, y6);
  97|      |        x3 = _mm512_xor_si512(x3, y7);
  98|      |        x4 = _mm512_xor_si512(x4, y8);
  99|      |
 100|      |        buf += 256;
 101|      |        len -= 256;
 102|      |    }
 103|      |
 104|      |    /*
 105|      |     * Fold into 512-bits.
 106|      |     */
 107|      |    x0 = _mm512_load_si512((__m512i *)k3k4);
 108|      |
 109|      |    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
 110|      |    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
 111|      |    x1 = _mm512_xor_si512(x1, x2);
 112|      |    x1 = _mm512_xor_si512(x1, x5);
 113|      |
 114|      |    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
 115|      |    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
 116|      |    x1 = _mm512_xor_si512(x1, x3);
 117|      |    x1 = _mm512_xor_si512(x1, x5);
 118|      |
 119|      |    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
 120|      |    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
 121|      |    x1 = _mm512_xor_si512(x1, x4);
 122|      |    x1 = _mm512_xor_si512(x1, x5);
 123|      |
 124|      |    /*
 125|      |     * Single fold blocks of 64, if any.
 126|      |     */
 127|      |    while (len >= 64)
 128|      |    {
 129|      |        x2 = _mm512_loadu_si512((__m512i *)buf);
 130|      |
 131|      |        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
 132|      |        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
 133|      |        x1 = _mm512_xor_si512(x1, x2);
 134|      |        x1 = _mm512_xor_si512(x1, x5);
 135|      |
 136|      |        buf += 64;
 137|      |        len -= 64;
 138|      |    }
 139|      |
 140|      |    /*
 141|      |     * Fold 512-bits to 384-bits.
 142|      |     */
 143|      |    a0 = _mm_load_si128((__m128i *)k5k6);
 144|      |
 145|      |    a1 = _mm512_extracti32x4_epi32(x1, 0);
 146|      |    a2 = _mm512_extracti32x4_epi32(x1, 1);
 147|      |
 148|      |    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
 149|      |    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
 150|      |
 151|      |    a1 = _mm_xor_si128(a1, a3);
 152|      |    a1 = _mm_xor_si128(a1, a2);
 153|      |
 154|      |    /*
 155|      |     * Fold 384-bits to 256-bits.
 156|      |     */
 157|      |    a2 = _mm512_extracti32x4_epi32(x1, 2);
 158|      |    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
 159|      |    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
 160|      |    a1 = _mm_xor_si128(a1, a3);
 161|      |    a1 = _mm_xor_si128(a1, a2);
 162|      |
 163|      |    /*
 164|      |     * Fold 256-bits to 128-bits.
 165|      |     */
 166|      |    a2 = _mm512_extracti32x4_epi32(x1, 3);
 167|      |    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
 168|      |    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
 169|      |    a1 = _mm_xor_si128(a1, a3);
 170|      |    a1 = _mm_xor_si128(a1, a2);
 171|      |
 172|      |    /*
 173|      |     * Fold 128-bits to 64-bits.
 174|      |     */
 175|      |    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
 176|      |    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
 177|      |    a1 = _mm_srli_si128(a1, 8);
 178|      |    a1 = _mm_xor_si128(a1, a2);
 179|      |
 180|      |    a0 = _mm_loadl_epi64((__m128i*)k7k8);
 181|      |    a2 = _mm_srli_si128(a1, 4);
 182|      |    a1 = _mm_and_si128(a1, a3);
 183|      |    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
 184|      |    a1 = _mm_xor_si128(a1, a2);
 185|      |
 186|      |    /*
 187|      |     * Barret reduce to 32-bits.
 188|      |     */
 189|      |    a0 = _mm_load_si128((__m128i*)poly);
 190|      |
 191|      |    a2 = _mm_and_si128(a1, a3);
 192|      |    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
 193|      |    a2 = _mm_and_si128(a2, a3);
 194|      |    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
 195|      |    a1 = _mm_xor_si128(a1, a2);
 196|      |
 197|      |    /*
 198|      |     * Return the crc32.
 199|      |     */
 200|      |    return _mm_extract_epi32(a1, 1);
 201|      |}
 202|      |
 203|      |#elif defined(CRC32_SIMD_SSE42_PCLMUL)
 204|      |
 205|      |/*
 206|      | * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 207|      | * length must be at least 64, and a multiple of 16.
 208|      | */
 209|      |
 210|      |#include <emmintrin.h>
 211|      |#include <smmintrin.h>
 212|      |#include <wmmintrin.h>
 213|      |
 214|      |uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
 215|      |    const unsigned char *buf,
 216|      |    z_size_t len,
 217|      |    uint32_t crc)
 218| 90.3k|{
 219|      |    /*
 220|      |     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
 221|      |     * the CRC32+Barrett polynomials given at the end of the paper.
 222|      |     */
 223| 90.3k|    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
 224| 90.3k|    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
 225| 90.3k|    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
 226| 90.3k|    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
 227|      |
 228| 90.3k|    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
 229|      |
 230|      |    /*
 231|      |     * There's at least one block of 64.
 232|      |     */
 233| 90.3k|    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
 234| 90.3k|    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
 235| 90.3k|    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
 236| 90.3k|    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
 237|      |
 238| 90.3k|    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
 239|      |
 240| 90.3k|    x0 = _mm_load_si128((__m128i *)k1k2);
 241|      |
 242| 90.3k|    buf += 64;
 243| 90.3k|    len -= 64;
 244|      |
 245|      |    /*
 246|      |     * Parallel fold blocks of 64, if any.
 247|      |     */
 248|  999k|    while (len >= 64)
 249|  908k|    {
 250|  908k|        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
 251|  908k|        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
 252|  908k|        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
 253|  908k|        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
 254|      |
 255|  908k|        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
 256|  908k|        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
 257|  908k|        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
 258|  908k|        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
 259|      |
 260|  908k|        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
 261|  908k|        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
 262|  908k|        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
 263|  908k|        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
 264|      |
 265|  908k|        x1 = _mm_xor_si128(x1, x5);
 266|  908k|        x2 = _mm_xor_si128(x2, x6);
 267|  908k|        x3 = _mm_xor_si128(x3, x7);
 268|  908k|        x4 = _mm_xor_si128(x4, x8);
 269|      |
 270|  908k|        x1 = _mm_xor_si128(x1, y5);
 271|  908k|        x2 = _mm_xor_si128(x2, y6);
 272|  908k|        x3 = _mm_xor_si128(x3, y7);
 273|  908k|        x4 = _mm_xor_si128(x4, y8);
 274|      |
 275|  908k|        buf += 64;
 276|  908k|        len -= 64;
 277|  908k|    }
 278|      |
 279|      |    /*
 280|      |     * Fold into 128-bits.
 281|      |     */
 282| 90.3k|    x0 = _mm_load_si128((__m128i *)k3k4);
 283|      |
 284| 90.3k|    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
 285| 90.3k|    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
 286| 90.3k|    x1 = _mm_xor_si128(x1, x2);
 287| 90.3k|    x1 = _mm_xor_si128(x1, x5);
 288|      |
 289| 90.3k|    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
 290| 90.3k|    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
 291| 90.3k|    x1 = _mm_xor_si128(x1, x3);
 292| 90.3k|    x1 = _mm_xor_si128(x1, x5);
 293|      |
 294| 90.3k|    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
 295| 90.3k|    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
 296| 90.3k|    x1 = _mm_xor_si128(x1, x4);
 297| 90.3k|    x1 = _mm_xor_si128(x1, x5);
 298|      |
 299|      |    /*
 300|      |     * Single fold blocks of 16, if any.
 301|      |     */
 302|  173k|    while (len >= 16)
 303| 82.9k|    {
 304| 82.9k|        x2 = _mm_loadu_si128((__m128i *)buf);
 305|      |
 306| 82.9k|        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
 307| 82.9k|        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
 308| 82.9k|        x1 = _mm_xor_si128(x1, x2);
 309| 82.9k|        x1 = _mm_xor_si128(x1, x5);
 310|      |
 311| 82.9k|        buf += 16;
 312| 82.9k|        len -= 16;
 313| 82.9k|    }
 314|      |
 315|      |    /*
 316|      |     * Fold 128-bits to 64-bits.
 317|      |     */
 318| 90.3k|    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
 319| 90.3k|    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
 320| 90.3k|    x1 = _mm_srli_si128(x1, 8);
 321| 90.3k|    x1 = _mm_xor_si128(x1, x2);
 322|      |
 323| 90.3k|    x0 = _mm_loadl_epi64((__m128i*)k5k0);
 324|      |
 325| 90.3k|    x2 = _mm_srli_si128(x1, 4);
 326| 90.3k|    x1 = _mm_and_si128(x1, x3);
 327| 90.3k|    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
 328| 90.3k|    x1 = _mm_xor_si128(x1, x2);
 329|      |
 330|      |    /*
 331|      |     * Barret reduce to 32-bits.
 332|      |     */
 333| 90.3k|    x0 = _mm_load_si128((__m128i*)poly);
 334|      |
 335| 90.3k|    x2 = _mm_and_si128(x1, x3);
 336| 90.3k|    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
 337| 90.3k|    x2 = _mm_and_si128(x2, x3);
 338| 90.3k|    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
 339| 90.3k|    x1 = _mm_xor_si128(x1, x2);
 340|      |
 341|      |    /*
 342|      |     * Return the crc32.
 343|      |     */
 344| 90.3k|    return _mm_extract_epi32(x1, 1);
 345| 90.3k|}
 346|      |
 347|      |#elif defined(CRC32_ARMV8_CRC32)
 348|      |
 349|      |/* CRC32 checksums using ARMv8-a crypto instructions.
 350|      | */
 351|      |
 352|      |#if defined(__clang__)
 353|      |/* We need some extra types for using PMULL.
 354|      | */
 355|      |#if defined(__aarch64__)
 356|      |#include <arm_neon.h>
 357|      |#include <arm_acle.h>
 358|      |#endif
 359|      |
 360|      |/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 361|      | * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 362|      | * (Namely, mixing and matching different module-level targets makes ThinLTO
 363|      | * warn, and Android defaults to armv7-a. This restriction does not apply to
 364|      | * function-level `target`s, however.)
 365|      | *
 366|      | * Since we only need four crc intrinsics, and since clang's implementation of
 367|      | * those are just wrappers around compiler builtins, it's simplest to #define
 368|      | * those builtins directly. If this #define list grows too much (or we depend on
 369|      | * an intrinsic that isn't a trivial wrapper), we may have to find a better way
 370|      | * to go about this.
 371|      | *
 372|      | * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 373|      | * feature for this target (ignoring feature)." This appears to be a harmless
 374|      | * bug in clang.
 375|      | *
 376|      | * These definitions must appear *after* including arm_acle.h otherwise that
 377|      | * header may end up defining functions named __builtin_arm_crc32* that call
 378|      | * themselves, creating an infinite loop when the intrinsic is called.
 379|      | */
 380|      |/* XXX: Cannot hook into builtins with XCode for arm64. */
 381|      |#if !defined(ARMV8_OS_MACOS)
 382|      |#define __crc32b __builtin_arm_crc32b
 383|      |#define __crc32d __builtin_arm_crc32d
 384|      |#define __crc32w __builtin_arm_crc32w
 385|      |#define __crc32cw __builtin_arm_crc32cw
 386|      |#endif
 387|      |
 388|      |#if defined(__aarch64__)
 389|      |#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
 390|      |#else  // !defined(__aarch64__)
 391|      |#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
 392|      |#endif  // defined(__aarch64__)
 393|      |
 394|      |#elif defined(__GNUC__)
 395|      |/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 396|      | * allowed. We can just include arm_acle.h.
 397|      | */
 398|      |#include <arm_acle.h>
 399|      |#include <arm_neon.h>
 400|      |#define TARGET_ARMV8_WITH_CRC
 401|      |#else  // !defined(__GNUC__) && !defined(_aarch64__)
 402|      |#error ARM CRC32 SIMD extensions only supported for Clang and GCC
 403|      |#endif
 404|      |
 405|      |TARGET_ARMV8_WITH_CRC
 406|      |uint32_t ZLIB_INTERNAL armv8_crc32_little(
 407|      |    const unsigned char *buf,
 408|      |    z_size_t len,
 409|      |    uint32_t crc)
 410|      |{
 411|      |    uint32_t c = (uint32_t) ~crc;
 412|      |
 413|      |    while (len && ((uintptr_t)buf & 7)) {
 414|      |        c = __crc32b(c, *buf++);
 415|      |        --len;
 416|      |    }
 417|      |
 418|      |    const uint64_t *buf8 = (const uint64_t *)buf;
 419|      |
 420|      |    while (len >= 64) {
 421|      |        c = __crc32d(c, *buf8++);
 422|      |        c = __crc32d(c, *buf8++);
 423|      |        c = __crc32d(c, *buf8++);
 424|      |        c = __crc32d(c, *buf8++);
 425|      |
 426|      |        c = __crc32d(c, *buf8++);
 427|      |        c = __crc32d(c, *buf8++);
 428|      |        c = __crc32d(c, *buf8++);
 429|      |        c = __crc32d(c, *buf8++);
 430|      |        len -= 64;
 431|      |    }
 432|      |
 433|      |    while (len >= 8) {
 434|      |        c = __crc32d(c, *buf8++);
 435|      |        len -= 8;
 436|      |    }
 437|      |
 438|      |    buf = (const unsigned char *)buf8;
 439|      |
 440|      |    while (len--) {
 441|      |        c = __crc32b(c, *buf++);
 442|      |    }
 443|      |
 444|      |    return ~c;
 445|      |}
 446|      |
 447|      |#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
 448|      |
 449|      |/*
 450|      | * crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
 451|      | * length must be at least 64, and a multiple of 16. Based on:
 452|      | *
 453|      | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 454|      | *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 455|      | */
 456|      |TARGET_ARMV8_WITH_CRC
 457|      |static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
 458|      |{
 459|      |    uint8x16_t r;
 460|      |    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
 461|      |        : "=w" (r) : "w" (a), "w" (b) );
 462|      |    return r;
 463|      |}
 464|      |
 465|      |TARGET_ARMV8_WITH_CRC
 466|      |static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
 467|      |{
 468|      |    uint8x16_t r;
 469|      |    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
 470|      |        : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
 471|      |    return r;
 472|      |}
 473|      |
 474|      |TARGET_ARMV8_WITH_CRC
 475|      |static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
 476|      |{
 477|      |    uint8x16_t r;
 478|      |    __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
 479|      |        : "=w" (r) : "w" (a), "w" (b) );
 480|      |    return r;
 481|      |}
 482|      |
 483|      |TARGET_ARMV8_WITH_CRC
 484|      |uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
 485|      |    const unsigned char *buf,
 486|      |    z_size_t len,
 487|      |    uint32_t crc)
 488|      |{
 489|      |    /*
 490|      |     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
 491|      |     * the CRC32+Barrett polynomials given at the end of the paper.
 492|      |     */
 493|      |    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
 494|      |    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
 495|      |    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
 496|      |    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
 497|      |
 498|      |    uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
 499|      |
 500|      |    /*
 501|      |     * There's at least one block of 64.
 502|      |     */
 503|      |    x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
 504|      |    x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
 505|      |    x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
 506|      |    x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
 507|      |
 508|      |    x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
 509|      |
 510|      |    x0 = vld1q_u64(k1k2);
 511|      |
 512|      |    buf += 64;
 513|      |    len -= 64;
 514|      |
 515|      |    /*
 516|      |     * Parallel fold blocks of 64, if any.
 517|      |     */
 518|      |    while (len >= 64)
 519|      |    {
 520|      |        x5 = (uint64x2_t) pmull_lo(x1, x0);
 521|      |        x6 = (uint64x2_t) pmull_lo(x2, x0);
 522|      |        x7 = (uint64x2_t) pmull_lo(x3, x0);
 523|      |        x8 = (uint64x2_t) pmull_lo(x4, x0);
 524|      |
 525|      |        y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
 526|      |        y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
 527|      |        y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
 528|      |        y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
 529|      |
 530|      |        x1 = (uint64x2_t) pmull_hi(x1, x0);
 531|      |        x2 = (uint64x2_t) pmull_hi(x2, x0);
 532|      |        x3 = (uint64x2_t) pmull_hi(x3, x0);
 533|      |        x4 = (uint64x2_t) pmull_hi(x4, x0);
 534|      |
 535|      |        x1 = veorq_u64(x1, x5);
 536|      |        x2 = veorq_u64(x2, x6);
 537|      |        x3 = veorq_u64(x3, x7);
 538|      |        x4 = veorq_u64(x4, x8);
 539|      |
 540|      |        x1 = veorq_u64(x1, y5);
 541|      |        x2 = veorq_u64(x2, y6);
 542|      |        x3 = veorq_u64(x3, y7);
 543|      |        x4 = veorq_u64(x4, y8);
 544|      |
 545|      |        buf += 64;
 546|      |        len -= 64;
 547|      |    }
 548|      |
 549|      |    /*
 550|      |     * Fold into 128-bits.
 551|      |     */
 552|      |    x0 = vld1q_u64(k3k4);
 553|      |
 554|      |    x5 = (uint64x2_t) pmull_lo(x1, x0);
 555|      |    x1 = (uint64x2_t) pmull_hi(x1, x0);
 556|      |    x1 = veorq_u64(x1, x2);
 557|      |    x1 = veorq_u64(x1, x5);
 558|      |
 559|      |    x5 = (uint64x2_t) pmull_lo(x1, x0);
 560|      |    x1 = (uint64x2_t) pmull_hi(x1, x0);
 561|      |    x1 = veorq_u64(x1, x3);
 562|      |    x1 = veorq_u64(x1, x5);
 563|      |
 564|      |    x5 = (uint64x2_t) pmull_lo(x1, x0);
 565|      |    x1 = (uint64x2_t) pmull_hi(x1, x0);
 566|      |    x1 = veorq_u64(x1, x4);
 567|      |    x1 = veorq_u64(x1, x5);
 568|      |
 569|      |    /*
 570|      |     * Single fold blocks of 16, if any.
 571|      |     */
 572|      |    while (len >= 16)
 573|      |    {
 574|      |        x2 = vld1q_u64((const uint64_t *)buf);
 575|      |
 576|      |        x5 = (uint64x2_t) pmull_lo(x1, x0);
 577|      |        x1 = (uint64x2_t) pmull_hi(x1, x0);
 578|      |        x1 = veorq_u64(x1, x2);
 579|      |        x1 = veorq_u64(x1, x5);
 580|      |
 581|      |        buf += 16;
 582|      |        len -= 16;
 583|      |    }
 584|      |
 585|      |    /*
 586|      |     * Fold 128-bits to 64-bits.
 587|      |     */
 588|      |    static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
 589|      |
 590|      |    x2 = (uint64x2_t) pmull_01(x1, x0);
 591|      |    x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
 592|      |    x3 = (uint64x2_t) vld1q_u32(mask);
 593|      |    x1 = veorq_u64(x1, x2);
 594|      |
 595|      |    x0 = vld1q_u64(k5k0);
 596|      |
 597|      |    x2 = (uint64x2_t) pmull_01(x2, x0);
 598|      |    x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
 599|      |    x1 = vandq_u64(x1, x3);
 600|      |    x1 = (uint64x2_t) pmull_lo(x1, x0);
 601|      |    x1 = veorq_u64(x1, x2);
 602|      |
 603|      |    /*
 604|      |     * Barret reduce to 32-bits.
 605|      |     */
 606|      |    x0 = vld1q_u64(poly);
 607|      |
 608|      |    x2 = vandq_u64(x1, x3);
 609|      |    x2 = (uint64x2_t) pmull_01(x2, x0);
 610|      |    x2 = vandq_u64(x2, x3);
 611|      |    x2 = (uint64x2_t) pmull_lo(x2, x0);
 612|      |    x1 = veorq_u64(x1, x2);
 613|      |
 614|      |    /*
 615|      |     * Return the crc32.
 616|      |     */
 617|      |    return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
 618|      |}
 619|      |#endif /* aarch64 specific code. */
 620|      |
 621|      |#endif
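
Note on the folding constants: the comment at lines 35-38 gives the recipe behind every k value in the constant tables, k = ( x^n mod P(x) << 32 )' << 1, where P(x) is the CRC-32 generator polynomial 0x04C11DB7 (with its implicit x^32 term) and ' denotes bit reflection. The sketch below implements that formula literally, assuming the reflection step is read as a 32-bit bit-reversal followed by the final shift; run with n = 544 and n = 480 it should reproduce the k3 and k4 values quoted for the AVX-512 path. It is an illustration of the comment's formula only, not part of the reported file.

    /* Sketch (illustrative, not in crc32_simd.c): derive a folding constant
     * k = ( x^n mod P(x) << 32 )' << 1 for the CRC-32 polynomial. */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t x_pow_n_mod_p(unsigned n)    /* x^n mod P(x), normal bit order */
    {
        uint32_t r = 1;                          /* bit i = coefficient of x^i */
        while (n--)
            r = (r << 1) ^ ((r >> 31) ? 0x04C11DB7u : 0u);
        return r;
    }

    static uint64_t reflect32(uint32_t v)        /* bit-reverse a 32-bit value */
    {
        uint64_t r = 0;
        for (int i = 0; i < 32; i++)
            r = (r << 1) | ((v >> i) & 1);
        return r;
    }

    int main(void)
    {
        /* k3 and k4 of the AVX-512 path use x^(512+32) and x^(512-32). */
        printf("k3 = 0x%010llx\n", (unsigned long long)(reflect32(x_pow_n_mod_p(544)) << 1));
        printf("k4 = 0x%010llx\n", (unsigned long long)(reflect32(x_pow_n_mod_p(480)) << 1));
        return 0;
    }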
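
Reading the counts: only the CRC32_SIMD_SSE42_PCLMUL branch carries execution counts in this run; crc32_sse42_simd_ was entered about 90.3k times (line 218) and the body of its 64-byte folding loop ran about 908k times (lines 249-277). The AVX-512 and ARMv8 branches show no counts, consistent with those regions being excluded by the preprocessor in this build. When the other kernels are exercised, the preconditions in their comments still apply: at least 256 bytes and a multiple of 64 for crc32_avx512_simd_, at least 64 bytes and a multiple of 16 for crc32_sse42_simd_ and armv8_crc32_pmull_little. Note also that, as the listing shows, the PCLMUL/PMULL kernels perform no pre- or post-inversion of the CRC (the caller is expected to handle that), whereas armv8_crc32_little applies ~crc on entry and exit itself. Kernels like these are conventionally validated against a bit-at-a-time reference; a minimal hypothetical helper (not part of the reported file) could look like this:

    /* Hypothetical cross-check helper (not in crc32_simd.c): bit-at-a-time
     * reflected CRC-32, polynomial 0xEDB88320, with the conventional
     * pre/post inversion handled inside. */
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t crc32_bitwise(uint32_t crc, const unsigned char *buf, size_t len)
    {
        crc = ~crc;
        while (len--) {
            crc ^= *buf++;
            for (int k = 0; k < 8; k++)
                crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
        }
        return ~crc;
    }

A test harness would feed the same buffer to crc32_bitwise and to the kernel under test (applying the inversion around the PCLMUL/PMULL calls as a caller normally would) and compare the two results.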