/src/skia/third_party/externals/zlib/crc32_simd.c
Line | Count | Source |
1 | | /* crc32_simd.c |
2 | | * |
3 | | * Copyright 2017 The Chromium Authors |
4 | | * Use of this source code is governed by a BSD-style license that can be |
5 | | * found in the Chromium source repository LICENSE file. |
6 | | */ |
7 | | |
8 | | #include "crc32_simd.h" |
9 | | #if defined(CRC32_SIMD_AVX512_PCLMUL) |
10 | | |
11 | | /* |
12 | | * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer |
13 | | * length must be at least 256, and a multiple of 64. Based on: |
14 | | * |
15 | | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
16 | | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
17 | | */ |
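| | /* |
| |  * A hypothetical caller sketch (not part of this file): the dispatcher is |
| |  * assumed to hand this routine only a prefix satisfying the contract above |
| |  * (>= 256 bytes and a multiple of 64) and to finish the tail with a generic |
| |  * table-driven CRC: |
| |  * |
| |  *   if (len >= 256) { |
| |  *       z_size_t chunk = len & ~(z_size_t)63;      // round down to 64 bytes |
| |  *       crc = crc32_avx512_simd_(buf, chunk, crc); |
| |  *       buf += chunk; |
| |  *       len -= chunk; |
| |  *   } |
| |  *   crc = crc32_generic(buf, len, crc);            // hypothetical fallback |
| |  */ |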
18 | | |
19 | | #include <emmintrin.h> |
20 | | #include <smmintrin.h> |
21 | | #include <wmmintrin.h> |
22 | | #include <immintrin.h> |
23 | | |
24 | | uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ |
25 | | const unsigned char *buf, |
26 | | z_size_t len, |
27 | | uint32_t crc) |
28 | | { |
29 | | /* |
30 | | * Definitions of the bit-reflected domain constants k1,k2,k3,k4 |
31 | | * are similar to those given at the end of the paper, and remaining |
32 | | * constants and CRC32+Barrett polynomials remain unchanged. |
33 | | * |
34 | | * The index of x is scaled up from 128 (as used in the paper) to 512, as follows: |
35 | | * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a |
36 | | * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 |
37 | | * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
38 | | * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
39 | | */ |
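| | /* |
| |  * Sketch (not part of this file) of how such constants can be generated, |
| |  * following the ( x^n mod P(x) << 32 )' << 1 construction quoted above |
| |  * with P(x) = 0x104C11DB7; assumes <stdint.h>: |
| |  * |
| |  *   static uint32_t xn_mod_p(unsigned n)      // x^n mod P(x), bit i = x^i |
| |  *   { |
| |  *       uint32_t r = 1;                       // start from x^0 |
| |  *       while (n--) { |
| |  *           uint32_t carry = r & 0x80000000u; |
| |  *           r <<= 1;                          // multiply by x |
| |  *           if (carry) |
| |  *               r ^= 0x04C11DB7u;             // reduce the x^32 term mod P |
| |  *       } |
| |  *       return r; |
| |  *   } |
| |  * |
| |  *   static uint64_t k_constant(unsigned n)    // ( x^n mod P(x) << 32 )' << 1 |
| |  *   { |
| |  *       uint32_t v = xn_mod_p(n), r = 0; |
| |  *       for (int i = 0; i < 32; i++)          // bit-reflect over 32 bits |
| |  *           r |= ((v >> i) & 1u) << (31 - i); |
| |  *       return (uint64_t)r << 1; |
| |  *   } |
| |  * |
| |  * e.g. k_constant(512 * 4 + 32) is expected to reproduce k1 = 0x011542778a. |
| |  */ |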
40 | | static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, |
41 | | 0x011542778a, 0x01322d1430, |
42 | | 0x011542778a, 0x01322d1430, |
43 | | 0x011542778a, 0x01322d1430 }; |
44 | | static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, |
45 | | 0x0154442bd4, 0x01c6e41596, |
46 | | 0x0154442bd4, 0x01c6e41596, |
47 | | 0x0154442bd4, 0x01c6e41596 }; |
48 | | static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; |
49 | | static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; |
50 | | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
51 | | __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
52 | | __m128i a0, a1, a2, a3; |
53 | | |
54 | | /* |
55 | | * There's at least one block of 256. |
56 | | */ |
57 | | x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
58 | | x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
59 | | x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
60 | | x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
61 | | |
62 | | x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); |
63 | | |
64 | | x0 = _mm512_load_si512((__m512i *)k1k2); |
65 | | |
66 | | buf += 256; |
67 | | len -= 256; |
68 | | |
69 | | /* |
70 | | * Parallel fold blocks of 256, if any. |
71 | | */ |
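| | /* |
| |  * Each iteration advances the 256 bytes held in x1..x4 over the next 256 |
| |  * bytes of input: within every 128-bit lane, the low 64-bit half is |
| |  * carry-less multiplied by k1 = x^(512*4+32) and the high half by |
| |  * k2 = x^(512*4-32), and both products are XORed with the freshly loaded |
| |  * data y5..y8. |
| |  */ |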
72 | | while (len >= 256) |
73 | | { |
74 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
75 | | x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); |
76 | | x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); |
77 | | x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); |
78 | | |
79 | | |
80 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
81 | | x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); |
82 | | x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); |
83 | | x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); |
84 | | |
85 | | y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
86 | | y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
87 | | y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
88 | | y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
89 | | |
90 | | x1 = _mm512_xor_si512(x1, x5); |
91 | | x2 = _mm512_xor_si512(x2, x6); |
92 | | x3 = _mm512_xor_si512(x3, x7); |
93 | | x4 = _mm512_xor_si512(x4, x8); |
94 | | |
95 | | x1 = _mm512_xor_si512(x1, y5); |
96 | | x2 = _mm512_xor_si512(x2, y6); |
97 | | x3 = _mm512_xor_si512(x3, y7); |
98 | | x4 = _mm512_xor_si512(x4, y8); |
99 | | |
100 | | buf += 256; |
101 | | len -= 256; |
102 | | } |
103 | | |
104 | | /* |
105 | | * Fold into 512-bits. |
106 | | */ |
107 | | x0 = _mm512_load_si512((__m512i *)k3k4); |
108 | | |
109 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
110 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
111 | | x1 = _mm512_xor_si512(x1, x2); |
112 | | x1 = _mm512_xor_si512(x1, x5); |
113 | | |
114 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
115 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
116 | | x1 = _mm512_xor_si512(x1, x3); |
117 | | x1 = _mm512_xor_si512(x1, x5); |
118 | | |
119 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
120 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
121 | | x1 = _mm512_xor_si512(x1, x4); |
122 | | x1 = _mm512_xor_si512(x1, x5); |
123 | | |
124 | | /* |
125 | | * Single fold blocks of 64, if any. |
126 | | */ |
127 | | while (len >= 64) |
128 | | { |
129 | | x2 = _mm512_loadu_si512((__m512i *)buf); |
130 | | |
131 | | x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
132 | | x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
133 | | x1 = _mm512_xor_si512(x1, x2); |
134 | | x1 = _mm512_xor_si512(x1, x5); |
135 | | |
136 | | buf += 64; |
137 | | len -= 64; |
138 | | } |
139 | | |
140 | | /* |
141 | | * Fold 512-bits to 384-bits. |
142 | | */ |
143 | | a0 = _mm_load_si128((__m128i *)k5k6); |
144 | | |
145 | | a1 = _mm512_extracti32x4_epi32(x1, 0); |
146 | | a2 = _mm512_extracti32x4_epi32(x1, 1); |
147 | | |
148 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
149 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
150 | | |
151 | | a1 = _mm_xor_si128(a1, a3); |
152 | | a1 = _mm_xor_si128(a1, a2); |
153 | | |
154 | | /* |
155 | | * Fold 384-bits to 256-bits. |
156 | | */ |
157 | | a2 = _mm512_extracti32x4_epi32(x1, 2); |
158 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
159 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
160 | | a1 = _mm_xor_si128(a1, a3); |
161 | | a1 = _mm_xor_si128(a1, a2); |
162 | | |
163 | | /* |
164 | | * Fold 256-bits to 128-bits. |
165 | | */ |
166 | | a2 = _mm512_extracti32x4_epi32(x1, 3); |
167 | | a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
168 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
169 | | a1 = _mm_xor_si128(a1, a3); |
170 | | a1 = _mm_xor_si128(a1, a2); |
171 | | |
172 | | /* |
173 | | * Fold 128-bits to 64-bits. |
174 | | */ |
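| | /* |
| |  * Two-step reduction of the 128-bit remainder: the low 64-bit half is |
| |  * multiplied by k6 (the high qword of k5k6, assumed to be x^(128-32) by |
| |  * analogy with the k1..k4 definitions above) and folded onto the high |
| |  * half, then the low 32 bits of that result are multiplied by k7 (x^64) |
| |  * and folded onto the remaining bits, leaving a 64-bit value for the |
| |  * Barrett step below. |
| |  */ |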
175 | | a2 = _mm_clmulepi64_si128(a1, a0, 0x10); |
176 | | a3 = _mm_setr_epi32(~0, 0, ~0, 0); |
177 | | a1 = _mm_srli_si128(a1, 8); |
178 | | a1 = _mm_xor_si128(a1, a2); |
179 | | |
180 | | a0 = _mm_loadl_epi64((__m128i*)k7k8); |
181 | | a2 = _mm_srli_si128(a1, 4); |
182 | | a1 = _mm_and_si128(a1, a3); |
183 | | a1 = _mm_clmulepi64_si128(a1, a0, 0x00); |
184 | | a1 = _mm_xor_si128(a1, a2); |
185 | | |
186 | | /* |
187 | | * Barrett reduce to 32-bits. |
188 | | */ |
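| | /* |
| |  * Barrett reduction in the bit-reflected domain: the low 32 bits of the |
| |  * 64-bit remainder are carry-less multiplied by poly[1] (the reflected |
| |  * quotient x^64 / P(x)), the low 32 bits of that product are multiplied |
| |  * by poly[0] (the reflected polynomial), and XORing the result back into |
| |  * a1 leaves the final CRC in its second 32-bit word, extracted below. |
| |  */ |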
189 | | a0 = _mm_load_si128((__m128i*)poly); |
190 | | |
191 | | a2 = _mm_and_si128(a1, a3); |
192 | | a2 = _mm_clmulepi64_si128(a2, a0, 0x10); |
193 | | a2 = _mm_and_si128(a2, a3); |
194 | | a2 = _mm_clmulepi64_si128(a2, a0, 0x00); |
195 | | a1 = _mm_xor_si128(a1, a2); |
196 | | |
197 | | /* |
198 | | * Return the crc32. |
199 | | */ |
200 | | return _mm_extract_epi32(a1, 1); |
201 | | } |
202 | | |
203 | | #elif defined(CRC32_SIMD_SSE42_PCLMUL) |
204 | | |
205 | | /* |
206 | | * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer |
207 | | * length must be at least 64, and a multiple of 16. |
208 | | */ |
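| | /* |
| |  * The 64-byte minimum matches the initial load into the four accumulators |
| |  * x1..x4 (4 x 16 bytes), and the multiple-of-16 requirement matches the |
| |  * single-fold loop, which consumes 16 bytes per iteration.  Shorter |
| |  * buffers and any trailing tail are assumed to be handled by the caller |
| |  * with a generic CRC routine. |
| |  */ |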
209 | | |
210 | | #include <emmintrin.h> |
211 | | #include <smmintrin.h> |
212 | | #include <wmmintrin.h> |
213 | | |
214 | | uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ |
215 | | const unsigned char *buf, |
216 | | z_size_t len, |
217 | | uint32_t crc) |
218 | 90.3k | { |
219 | | /* |
220 | | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
221 | | * the CRC32+Barrett polynomials given at the end of the paper. |
222 | | */ |
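| | /* |
| |  * In the exponent notation used in the AVX512 section above, these are |
| |  * assumed to correspond to: |
| |  *   k1 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 |
| |  *   k2 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 |
| |  *   k3 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 = 0x01751997d0 |
| |  *   k4 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 = 0x00ccaa009e |
| |  *   k5 = ( x ^ ( 64 ) mod P(x) << 32 )'       << 1 = 0x0163cd6124 |
| |  * i.e. the parallel folds advance by 64 bytes and the single folds by 16. |
| |  */ |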
223 | 90.3k | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
224 | 90.3k | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
225 | 90.3k | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
226 | 90.3k | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
227 | | |
228 | 90.3k | __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
229 | | |
230 | | /* |
231 | | * There's at least one block of 64. |
232 | | */ |
233 | 90.3k | x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
234 | 90.3k | x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
235 | 90.3k | x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
236 | 90.3k | x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
237 | | |
238 | 90.3k | x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); |
239 | | |
240 | 90.3k | x0 = _mm_load_si128((__m128i *)k1k2); |
241 | | |
242 | 90.3k | buf += 64; |
243 | 90.3k | len -= 64; |
244 | | |
245 | | /* |
246 | | * Parallel fold blocks of 64, if any. |
247 | | */ |
248 | 999k | while (len >= 64) |
249 | 908k | { |
250 | 908k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
251 | 908k | x6 = _mm_clmulepi64_si128(x2, x0, 0x00); |
252 | 908k | x7 = _mm_clmulepi64_si128(x3, x0, 0x00); |
253 | 908k | x8 = _mm_clmulepi64_si128(x4, x0, 0x00); |
254 | | |
255 | 908k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
256 | 908k | x2 = _mm_clmulepi64_si128(x2, x0, 0x11); |
257 | 908k | x3 = _mm_clmulepi64_si128(x3, x0, 0x11); |
258 | 908k | x4 = _mm_clmulepi64_si128(x4, x0, 0x11); |
259 | | |
260 | 908k | y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
261 | 908k | y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
262 | 908k | y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
263 | 908k | y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
264 | | |
265 | 908k | x1 = _mm_xor_si128(x1, x5); |
266 | 908k | x2 = _mm_xor_si128(x2, x6); |
267 | 908k | x3 = _mm_xor_si128(x3, x7); |
268 | 908k | x4 = _mm_xor_si128(x4, x8); |
269 | | |
270 | 908k | x1 = _mm_xor_si128(x1, y5); |
271 | 908k | x2 = _mm_xor_si128(x2, y6); |
272 | 908k | x3 = _mm_xor_si128(x3, y7); |
273 | 908k | x4 = _mm_xor_si128(x4, y8); |
274 | | |
275 | 908k | buf += 64; |
276 | 908k | len -= 64; |
277 | 908k | } |
278 | | |
279 | | /* |
280 | | * Fold into 128-bits. |
281 | | */ |
282 | 90.3k | x0 = _mm_load_si128((__m128i *)k3k4); |
283 | | |
284 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
285 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
286 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
287 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
288 | | |
289 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
290 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
291 | 90.3k | x1 = _mm_xor_si128(x1, x3); |
292 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
293 | | |
294 | 90.3k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
295 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
296 | 90.3k | x1 = _mm_xor_si128(x1, x4); |
297 | 90.3k | x1 = _mm_xor_si128(x1, x5); |
298 | | |
299 | | /* |
300 | | * Single fold blocks of 16, if any. |
301 | | */ |
302 | 173k | while (len >= 16) |
303 | 82.9k | { |
304 | 82.9k | x2 = _mm_loadu_si128((__m128i *)buf); |
305 | | |
306 | 82.9k | x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
307 | 82.9k | x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
308 | 82.9k | x1 = _mm_xor_si128(x1, x2); |
309 | 82.9k | x1 = _mm_xor_si128(x1, x5); |
310 | | |
311 | 82.9k | buf += 16; |
312 | 82.9k | len -= 16; |
313 | 82.9k | } |
314 | | |
315 | | /* |
316 | | * Fold 128-bits to 64-bits. |
317 | | */ |
318 | 90.3k | x2 = _mm_clmulepi64_si128(x1, x0, 0x10); |
319 | 90.3k | x3 = _mm_setr_epi32(~0, 0, ~0, 0); |
320 | 90.3k | x1 = _mm_srli_si128(x1, 8); |
321 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
322 | | |
323 | 90.3k | x0 = _mm_loadl_epi64((__m128i*)k5k0); |
324 | | |
325 | 90.3k | x2 = _mm_srli_si128(x1, 4); |
326 | 90.3k | x1 = _mm_and_si128(x1, x3); |
327 | 90.3k | x1 = _mm_clmulepi64_si128(x1, x0, 0x00); |
328 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
329 | | |
330 | | /* |
331 | | * Barrett reduce to 32-bits. |
332 | | */ |
333 | 90.3k | x0 = _mm_load_si128((__m128i*)poly); |
334 | | |
335 | 90.3k | x2 = _mm_and_si128(x1, x3); |
336 | 90.3k | x2 = _mm_clmulepi64_si128(x2, x0, 0x10); |
337 | 90.3k | x2 = _mm_and_si128(x2, x3); |
338 | 90.3k | x2 = _mm_clmulepi64_si128(x2, x0, 0x00); |
339 | 90.3k | x1 = _mm_xor_si128(x1, x2); |
340 | | |
341 | | /* |
342 | | * Return the crc32. |
343 | | */ |
344 | 90.3k | return _mm_extract_epi32(x1, 1); |
345 | 90.3k | } |
346 | | |
347 | | #elif defined(CRC32_ARMV8_CRC32) |
348 | | |
349 | | /* CRC32 checksums using ARMv8-a crypto instructions. |
350 | | */ |
351 | | |
352 | | #if defined(__clang__) |
353 | | /* We need some extra types for using PMULL. |
354 | | */ |
355 | | #if defined(__aarch64__) |
356 | | #include <arm_neon.h> |
357 | | #include <arm_acle.h> |
358 | | #endif |
359 | | |
360 | | /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an |
361 | | * armv8 target, which is incompatible with ThinLTO optimizations on Android. |
362 | | * (Namely, mixing and matching different module-level targets makes ThinLTO |
363 | | * warn, and Android defaults to armv7-a. This restriction does not apply to |
364 | | * function-level `target`s, however.) |
365 | | * |
366 | | * Since we only need four crc intrinsics, and since clang's implementation of |
367 | | * those are just wrappers around compiler builtins, it's simplest to #define |
368 | | * those builtins directly. If this #define list grows too much (or we depend on |
369 | | * an intrinsic that isn't a trivial wrapper), we may have to find a better way |
370 | | * to go about this. |
371 | | * |
372 | | * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized |
373 | | * feature for this target (ignoring feature)." This appears to be a harmless |
374 | | * bug in clang. |
375 | | * |
376 | | * These definitions must appear *after* including arm_acle.h otherwise that |
377 | | * header may end up defining functions named __builtin_arm_crc32* that call |
378 | | * themselves, creating an infinite loop when the intrinsic is called. |
379 | | */ |
380 | | /* XXX: Cannot hook into builtins with Xcode for arm64. */ |
381 | | #if !defined(ARMV8_OS_MACOS) |
382 | | #define __crc32b __builtin_arm_crc32b |
383 | | #define __crc32d __builtin_arm_crc32d |
384 | | #define __crc32w __builtin_arm_crc32w |
385 | | #define __crc32cw __builtin_arm_crc32cw |
386 | | #endif |
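| | /* With the definitions above, a call such as __crc32d(c, v) in the code |
| |  * below compiles directly as the __builtin_arm_crc32d builtin, so no |
| |  * module-level armv8 target is required. |
| |  */ |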
387 | | |
388 | | #if defined(__aarch64__) |
389 | | #define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc"))) |
390 | | #else // !defined(__aarch64__) |
391 | | #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) |
392 | | #endif // defined(__aarch64__) |
393 | | |
394 | | #elif defined(__GNUC__) |
395 | | /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not |
396 | | * allowed. We can just include arm_acle.h. |
397 | | */ |
398 | | #include <arm_acle.h> |
399 | | #include <arm_neon.h> |
400 | | #define TARGET_ARMV8_WITH_CRC |
401 | | #else // !defined(__GNUC__) && !defined(__clang__) |
402 | | #error ARM CRC32 SIMD extensions only supported for Clang and GCC |
403 | | #endif |
404 | | |
405 | | TARGET_ARMV8_WITH_CRC |
406 | | uint32_t ZLIB_INTERNAL armv8_crc32_little( |
407 | | const unsigned char *buf, |
408 | | z_size_t len, |
409 | | uint32_t crc) |
410 | | { |
411 | | uint32_t c = (uint32_t) ~crc; |
412 | | |
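| | /* Process single bytes until buf reaches an 8-byte boundary. */ |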
413 | | while (len && ((uintptr_t)buf & 7)) { |
414 | | c = __crc32b(c, *buf++); |
415 | | --len; |
416 | | } |
417 | | |
418 | | const uint64_t *buf8 = (const uint64_t *)buf; |
419 | | |
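| | /* Main loop: eight hardware CRC32 steps per 64-byte block. */ |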
420 | | while (len >= 64) { |
421 | | c = __crc32d(c, *buf8++); |
422 | | c = __crc32d(c, *buf8++); |
423 | | c = __crc32d(c, *buf8++); |
424 | | c = __crc32d(c, *buf8++); |
425 | | |
426 | | c = __crc32d(c, *buf8++); |
427 | | c = __crc32d(c, *buf8++); |
428 | | c = __crc32d(c, *buf8++); |
429 | | c = __crc32d(c, *buf8++); |
430 | | len -= 64; |
431 | | } |
432 | | |
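| | /* Fold in any remaining whole 8-byte words. */ |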
433 | | while (len >= 8) { |
434 | | c = __crc32d(c, *buf8++); |
435 | | len -= 8; |
436 | | } |
437 | | |
438 | | buf = (const unsigned char *)buf8; |
439 | | |
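| | /* Final 0..7 bytes, one at a time. */ |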
440 | | while (len--) { |
441 | | c = __crc32b(c, *buf++); |
442 | | } |
443 | | |
444 | | return ~c; |
445 | | } |
446 | | |
447 | | #if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */ |
448 | | |
449 | | /* |
450 | | * armv8_crc32_pmull_little(): compute the crc32 of the buffer, where the buffer |
451 | | * length must be at least 64, and a multiple of 16. Based on: |
452 | | * |
453 | | * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
454 | | * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
455 | | */ |
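| | /* |
| |  * Thin wrappers around the PMULL/PMULL2 instructions: pmull_lo multiplies |
| |  * the low 64-bit lanes of a and b, pmull_hi the high lanes, and pmull_01 |
| |  * the low lane of a by the high lane of b, each yielding a 128-bit |
| |  * carry-less product (the counterparts of _mm_clmulepi64_si128 with |
| |  * immediates 0x00, 0x11 and 0x10 in the SSE code above). |
| |  */ |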
456 | | TARGET_ARMV8_WITH_CRC |
457 | | static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b) |
458 | | { |
459 | | uint8x16_t r; |
460 | | __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t" |
461 | | : "=w" (r) : "w" (a), "w" (b) ); |
462 | | return r; |
463 | | } |
464 | | |
465 | | TARGET_ARMV8_WITH_CRC |
466 | | static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b) |
467 | | { |
468 | | uint8x16_t r; |
469 | | __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t" |
470 | | : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) ); |
471 | | return r; |
472 | | } |
473 | | |
474 | | TARGET_ARMV8_WITH_CRC |
475 | | static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b) |
476 | | { |
477 | | uint8x16_t r; |
478 | | __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t" |
479 | | : "=w" (r) : "w" (a), "w" (b) ); |
480 | | return r; |
481 | | } |
482 | | |
483 | | TARGET_ARMV8_WITH_CRC |
484 | | uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little( |
485 | | const unsigned char *buf, |
486 | | z_size_t len, |
487 | | uint32_t crc) |
488 | | { |
489 | | /* |
490 | | * Definitions of the bit-reflected domain constants k1,k2,k3, etc and |
491 | | * the CRC32+Barrett polynomials given at the end of the paper. |
492 | | */ |
493 | | static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; |
494 | | static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; |
495 | | static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; |
496 | | static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; |
497 | | |
498 | | uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
499 | | |
500 | | /* |
501 | | * There's at least one block of 64. |
502 | | */ |
503 | | x1 = vld1q_u64((const uint64_t *)(buf + 0x00)); |
504 | | x2 = vld1q_u64((const uint64_t *)(buf + 0x10)); |
505 | | x3 = vld1q_u64((const uint64_t *)(buf + 0x20)); |
506 | | x4 = vld1q_u64((const uint64_t *)(buf + 0x30)); |
507 | | |
508 | | x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0)); |
509 | | |
510 | | x0 = vld1q_u64(k1k2); |
511 | | |
512 | | buf += 64; |
513 | | len -= 64; |
514 | | |
515 | | /* |
516 | | * Parallel fold blocks of 64, if any. |
517 | | */ |
518 | | while (len >= 64) |
519 | | { |
520 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
521 | | x6 = (uint64x2_t) pmull_lo(x2, x0); |
522 | | x7 = (uint64x2_t) pmull_lo(x3, x0); |
523 | | x8 = (uint64x2_t) pmull_lo(x4, x0); |
524 | | |
525 | | y5 = vld1q_u64((const uint64_t *)(buf + 0x00)); |
526 | | y6 = vld1q_u64((const uint64_t *)(buf + 0x10)); |
527 | | y7 = vld1q_u64((const uint64_t *)(buf + 0x20)); |
528 | | y8 = vld1q_u64((const uint64_t *)(buf + 0x30)); |
529 | | |
530 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
531 | | x2 = (uint64x2_t) pmull_hi(x2, x0); |
532 | | x3 = (uint64x2_t) pmull_hi(x3, x0); |
533 | | x4 = (uint64x2_t) pmull_hi(x4, x0); |
534 | | |
535 | | x1 = veorq_u64(x1, x5); |
536 | | x2 = veorq_u64(x2, x6); |
537 | | x3 = veorq_u64(x3, x7); |
538 | | x4 = veorq_u64(x4, x8); |
539 | | |
540 | | x1 = veorq_u64(x1, y5); |
541 | | x2 = veorq_u64(x2, y6); |
542 | | x3 = veorq_u64(x3, y7); |
543 | | x4 = veorq_u64(x4, y8); |
544 | | |
545 | | buf += 64; |
546 | | len -= 64; |
547 | | } |
548 | | |
549 | | /* |
550 | | * Fold into 128-bits. |
551 | | */ |
552 | | x0 = vld1q_u64(k3k4); |
553 | | |
554 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
555 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
556 | | x1 = veorq_u64(x1, x2); |
557 | | x1 = veorq_u64(x1, x5); |
558 | | |
559 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
560 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
561 | | x1 = veorq_u64(x1, x3); |
562 | | x1 = veorq_u64(x1, x5); |
563 | | |
564 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
565 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
566 | | x1 = veorq_u64(x1, x4); |
567 | | x1 = veorq_u64(x1, x5); |
568 | | |
569 | | /* |
570 | | * Single fold blocks of 16, if any. |
571 | | */ |
572 | | while (len >= 16) |
573 | | { |
574 | | x2 = vld1q_u64((const uint64_t *)buf); |
575 | | |
576 | | x5 = (uint64x2_t) pmull_lo(x1, x0); |
577 | | x1 = (uint64x2_t) pmull_hi(x1, x0); |
578 | | x1 = veorq_u64(x1, x2); |
579 | | x1 = veorq_u64(x1, x5); |
580 | | |
581 | | buf += 16; |
582 | | len -= 16; |
583 | | } |
584 | | |
585 | | /* |
586 | | * Fold 128-bits to 64-bits. |
587 | | */ |
588 | | static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u }; |
589 | | |
590 | | x2 = (uint64x2_t) pmull_01(x1, x0); |
591 | | x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8); |
592 | | x3 = (uint64x2_t) vld1q_u32(mask); |
593 | | x1 = veorq_u64(x1, x2); |
594 | | |
595 | | x0 = vld1q_u64(k5k0); |
596 | | |
597 | | x2 = (uint64x2_t) pmull_01(x2, x0); |
598 | | x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4); |
599 | | x1 = vandq_u64(x1, x3); |
600 | | x1 = (uint64x2_t) pmull_lo(x1, x0); |
601 | | x1 = veorq_u64(x1, x2); |
602 | | |
603 | | /* |
604 | | * Barrett reduce to 32-bits. |
605 | | */ |
606 | | x0 = vld1q_u64(poly); |
607 | | |
608 | | x2 = vandq_u64(x1, x3); |
609 | | x2 = (uint64x2_t) pmull_01(x2, x0); |
610 | | x2 = vandq_u64(x2, x3); |
611 | | x2 = (uint64x2_t) pmull_lo(x2, x0); |
612 | | x1 = veorq_u64(x1, x2); |
613 | | |
614 | | /* |
615 | | * Return the crc32. |
616 | | */ |
617 | | return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1); |
618 | | } |
619 | | #endif /* aarch64 specific code. */ |
620 | | |
621 | | #endif |