Coverage Report

Created: 2025-08-26 06:46

/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
Line
Count
Source (jump to first uncovered line)
1
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
2
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
3
 * For conditions of distribution and use, see copyright notice in zlib.h
4
 */
5
6
#ifdef COPY
7
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
8
0
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
9
#else
10
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
11
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
12
0
    __m128i init_crc, int32_t first) {
13
0
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
14
0
#endif
15
0
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
16
0
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
17
0
    __m512i z0, z1, z2, z3;
18
0
    size_t len_tmp = len;
19
0
    const __m512i zmm_fold4 = _mm512_set4_epi32(
20
0
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
21
0
    const __m512i zmm_fold16 = _mm512_set4_epi32(
22
0
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
23
24
    // zmm register init
25
0
    zmm_crc0 = _mm512_setzero_si512();
26
0
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
27
#ifndef COPY
28
0
    XOR_INITIAL512(zmm_t0);
29
#endif
30
0
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
31
0
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
32
0
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
33
34
    /* already have intermediate CRC in xmm registers
35
        * fold4 with 4 xmm_crc to get zmm_crc0
36
    */
37
0
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
38
0
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
39
0
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
40
0
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
41
0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
42
0
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
43
0
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
44
45
#ifdef COPY
46
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
47
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
48
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
49
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
50
    dst += 256;
51
#endif
52
0
    len -= 256;
53
0
    src += 256;
54
55
    // fold-16 loops
56
0
    while (len >= 256) {
57
0
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
58
0
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
59
0
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
60
0
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
61
62
0
        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
63
0
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
64
0
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
65
0
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
66
67
0
        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
68
0
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
69
0
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
70
0
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
71
72
0
        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
73
0
        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
74
0
        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
75
0
        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);
76
77
#ifdef COPY
78
        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
79
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
80
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
81
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
82
        dst += 256;
83
#endif
84
0
        len -= 256;
85
0
        src += 256;
86
0
    }
87
    // zmm_crc[0,1,2,3] -> zmm_crc0
88
0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
89
0
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
90
0
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);
91
92
0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
93
0
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
94
0
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);
95
96
0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
97
0
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
98
0
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);
99
100
    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
101
0
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
102
0
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
103
0
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
104
0
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
105
106
0
    return (len_tmp - len);  // return n bytes processed
107
0
}