/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq_tpl.h
/* crc32_fold_vpclmulqdq_tpl.h -- VPCLMULQDQ-based CRC32 folding template.
 * Copyright Wangyang Guo (wangyang.guo@intel.com)
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef COPY
static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
#endif
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
    size_t len_tmp = len;
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

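    /* zmm_fold4 and zmm_fold16 above are the carry-less multiplication folding
     * constants. Judging from how they are used below, zmm_fold16 shifts the
     * running CRC state forward by the 256 bytes consumed per fold-16 loop
     * iteration, while zmm_fold4 shifts by 64 bytes and is used to merge the
     * accumulators. The constant values follow the standard PCLMULQDQ CRC
     * folding method; their derivation is not restated here. */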
    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
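    /* XOR_INITIAL512 is provided by the source file that includes this
     * template; presumably it XORs the zero-extended starting CRC
     * (zmm_initial) into the first 64 bytes of input on the first call,
     * controlled by 'first'. */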
#ifndef COPY
    XOR_INITIAL512(zmm_t0);
#endif
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
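    /* The ternary-logic immediate 0x96 selects a three-way XOR (a ^ b ^ c),
     * so the high and low carry-less products are combined with the newly
     * loaded data in a single instruction. */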

#ifdef COPY
    /* zmm_crc1..3 were loaded from src and have not been modified yet, so
     * they double as the copy source for the first 256 bytes. */
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    dst += 256;
#endif
    len -= 256;
    src += 256;

    // fold-16 loop: each iteration folds the running state forward and absorbs the next 256 bytes of input
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
        zmm_crc1 = _mm512_ternarylogic_epi32(zmm_crc1, z1, zmm_t1, 0x96);
        zmm_crc2 = _mm512_ternarylogic_epi32(zmm_crc2, z2, zmm_t2, 0x96);
        zmm_crc3 = _mm512_ternarylogic_epi32(zmm_crc3, z3, zmm_t3, 0x96);

#ifdef COPY
        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        dst += 256;
#endif
        len -= 256;
        src += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0: fold the four accumulators down to one
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc1, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc2, 0x96);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_crc3, 0x96);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len); // return n bytes processed
}
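
/* Usage sketch (illustrative assumption, not taken from this file): template
 * headers like this one are typically included twice by a parent source file,
 * once with COPY defined to emit fold_16_vpclmulqdq_copy() and once without
 * to emit fold_16_vpclmulqdq(), e.g.:
 *
 *   #define COPY
 *   #include "crc32_fold_vpclmulqdq_tpl.h"
 *   #undef COPY
 *   #include "crc32_fold_vpclmulqdq_tpl.h"
 *
 * The exact include sites in zlib-ng may differ. The caller keeps the CRC
 * state in xmm_crc0..3, must supply at least 256 bytes of input, and uses the
 * return value (number of bytes consumed) to handle any tail shorter than
 * 256 bytes with the narrower folding routines. */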