/src/zlib-ng/arch/x86/adler32_avx512_vnni.c
Line | Count | Source |
1 | | /* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream |
2 | | * Based on Brian Bockelman's AVX2 version |
3 | | * Copyright (C) 1995-2011 Mark Adler |
4 | | * Authors: |
5 | | * Adam Stylinski <kungfujesus06@gmail.com> |
6 | | * Brian Bockelman <bockelman@gmail.com> |
7 | | * For conditions of distribution and use, see copyright notice in zlib.h |
8 | | */ |
9 | | |
10 | | #ifdef X86_AVX512VNNI |
11 | | |
12 | | #include "zbuild.h" |
13 | | #include "adler32_p.h" |
14 | | #include "arch_functions.h" |
15 | | #include <immintrin.h> |
16 | | #include "x86_intrins.h" |
17 | | #include "adler32_avx512_p.h" |
18 | | #include "adler32_avx2_p.h" |
19 | | |
20 | 0 | Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { |
21 | 0 | uint32_t adler0, adler1; |
22 | 0 | adler1 = (adler >> 16) & 0xffff; |
23 | 0 | adler0 = adler & 0xffff; |
24 | |
|
25 | 0 | rem_peel: |
26 | 0 | if (len < 32) |
27 | 0 | return adler32_ssse3(adler, src, len); |
28 | | |
29 | 0 | if (len < 64) |
30 | 0 | return adler32_avx2(adler, src, len); |
31 | | |
32 | 0 | const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, |
33 | 0 | 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, |
34 | 0 | 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, |
35 | 0 | 56, 57, 58, 59, 60, 61, 62, 63, 64); |
36 | |
|
37 | 0 | const __m512i zero = _mm512_setzero_si512(); |
38 | 0 | __m512i vs1, vs2; |
39 | |
|
40 | 0 | while (len >= 64) { |
41 | 0 | vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); |
42 | 0 | vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); |
43 | 0 | size_t k = ALIGN_DOWN(MIN(len, NMAX), 64); |
44 | 0 | len -= k; |
45 | 0 | __m512i vs1_0 = vs1; |
46 | 0 | __m512i vs3 = _mm512_setzero_si512(); |
47 | | /* We might get a tad bit more ILP here if we sum to a second register in the loop */ |
48 | 0 | __m512i vs2_1 = _mm512_setzero_si512(); |
49 | 0 | __m512i vbuf0, vbuf1; |
50 | | |
51 | | /* Remainder peeling */ |
52 | 0 | if (k % 128) { |
53 | 0 | vbuf1 = _mm512_loadu_si512((__m512i*)src); |
54 | |
|
55 | 0 | src += 64; |
56 | 0 | k -= 64; |
57 | |
|
58 | 0 | __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero); |
59 | 0 | vs1 = _mm512_add_epi32(vs1, vs1_sad); |
60 | 0 | vs3 = _mm512_add_epi32(vs3, vs1_0); |
61 | 0 | vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); |
62 | 0 | vs1_0 = vs1; |
63 | 0 | } |
64 | | |
65 | | /* Manually unrolled this loop by 2 for an decent amount of ILP */ |
66 | 0 | while (k >= 128) { |
67 | | /* |
68 | | vs1 = adler + sum(c[i]) |
69 | | vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) |
70 | | */ |
71 | 0 | vbuf0 = _mm512_loadu_si512((__m512i*)src); |
72 | 0 | vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64)); |
73 | 0 | src += 128; |
74 | 0 | k -= 128; |
75 | |
|
76 | 0 | __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); |
77 | 0 | vs1 = _mm512_add_epi32(vs1, vs1_sad); |
78 | 0 | vs3 = _mm512_add_epi32(vs3, vs1_0); |
79 | | /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp |
80 | | * instructions to eliminate them */ |
81 | 0 | vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); |
82 | |
|
83 | 0 | vs3 = _mm512_add_epi32(vs3, vs1); |
84 | 0 | vs1_sad = _mm512_sad_epu8(vbuf1, zero); |
85 | 0 | vs1 = _mm512_add_epi32(vs1, vs1_sad); |
86 | 0 | vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v); |
87 | 0 | vs1_0 = vs1; |
88 | 0 | } |
89 | |
|
90 | 0 | vs3 = _mm512_slli_epi32(vs3, 6); |
91 | 0 | vs2 = _mm512_add_epi32(vs2, vs3); |
92 | 0 | vs2 = _mm512_add_epi32(vs2, vs2_1); |
93 | |
|
94 | 0 | adler0 = partial_hsum(vs1) % BASE; |
95 | 0 | adler1 = _mm512_reduce_add_epu32(vs2) % BASE; |
96 | 0 | } |
97 | |
|
98 | 0 | adler = adler0 | (adler1 << 16); |
99 | | |
100 | | /* Process tail (len < 64). */ |
101 | 0 | if (len) { |
102 | 0 | goto rem_peel; |
103 | 0 | } |
104 | | |
105 | 0 | return adler; |
106 | 0 | } |
107 | | |
108 | | /* Use 256-bit vectors when copying because 512-bit variant is slower. */ |
109 | 0 | Z_INTERNAL uint32_t adler32_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { |
110 | 0 | uint32_t adler0, adler1; |
111 | 0 | adler1 = (adler >> 16) & 0xffff; |
112 | 0 | adler0 = adler & 0xffff; |
113 | |
|
114 | 0 | rem_peel_copy: |
115 | 0 | if (len < 32) { |
116 | | /* This handles the remaining copies, just call normal adler checksum after this */ |
117 | 0 | __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len)); |
118 | 0 | __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); |
119 | 0 | _mm256_mask_storeu_epi8(dst, storemask, copy_vec); |
120 | |
|
121 | 0 | return adler32_ssse3(adler, src, len); |
122 | 0 | } |
123 | | |
124 | 0 | const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, |
125 | 0 | 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); |
126 | |
|
127 | 0 | const __m256i zero = _mm256_setzero_si256(); |
128 | 0 | __m256i vs1, vs2; |
129 | |
|
130 | 0 | while (len >= 32) { |
131 | 0 | vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); |
132 | 0 | vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); |
133 | |
|
134 | 0 | size_t k = ALIGN_DOWN(MIN(len, NMAX), 32); |
135 | 0 | len -= k; |
136 | |
|
137 | 0 | __m256i vs1_0 = vs1; |
138 | 0 | __m256i vs3 = _mm256_setzero_si256(); |
139 | | /* We might get a tad bit more ILP here if we sum to a second register in the loop */ |
140 | 0 | __m256i vs2_1 = _mm256_setzero_si256(); |
141 | 0 | __m256i vbuf0, vbuf1; |
142 | | |
143 | | /* Remainder peeling */ |
144 | 0 | if (k % 64) { |
145 | 0 | vbuf1 = _mm256_loadu_si256((__m256i*)src); |
146 | 0 | _mm256_storeu_si256((__m256i*)dst, vbuf1); |
147 | 0 | dst += 32; |
148 | |
|
149 | 0 | src += 32; |
150 | 0 | k -= 32; |
151 | |
|
152 | 0 | __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero); |
153 | 0 | vs1 = _mm256_add_epi32(vs1, vs1_sad); |
154 | 0 | vs3 = _mm256_add_epi32(vs3, vs1_0); |
155 | 0 | vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v); |
156 | 0 | vs1_0 = vs1; |
157 | 0 | } |
158 | | |
159 | | /* Manually unrolled this loop by 2 for an decent amount of ILP */ |
160 | 0 | while (k >= 64) { |
161 | | /* |
162 | | vs1 = adler + sum(c[i]) |
163 | | vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) |
164 | | */ |
165 | 0 | vbuf0 = _mm256_loadu_si256((__m256i*)src); |
166 | 0 | vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32)); |
167 | 0 | _mm256_storeu_si256((__m256i*)dst, vbuf0); |
168 | 0 | _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1); |
169 | 0 | dst += 64; |
170 | 0 | src += 64; |
171 | 0 | k -= 64; |
172 | |
|
173 | 0 | __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero); |
174 | 0 | vs1 = _mm256_add_epi32(vs1, vs1_sad); |
175 | 0 | vs3 = _mm256_add_epi32(vs3, vs1_0); |
176 | | /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp |
177 | | * instructions to eliminate them */ |
178 | 0 | vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v); |
179 | |
|
180 | 0 | vs3 = _mm256_add_epi32(vs3, vs1); |
181 | 0 | vs1_sad = _mm256_sad_epu8(vbuf1, zero); |
182 | 0 | vs1 = _mm256_add_epi32(vs1, vs1_sad); |
183 | 0 | vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v); |
184 | 0 | vs1_0 = vs1; |
185 | 0 | } |
186 | |
|
187 | 0 | vs3 = _mm256_slli_epi32(vs3, 5); |
188 | 0 | vs2 = _mm256_add_epi32(vs2, vs3); |
189 | 0 | vs2 = _mm256_add_epi32(vs2, vs2_1); |
190 | |
|
191 | 0 | adler0 = partial_hsum256(vs1) % BASE; |
192 | 0 | adler1 = hsum256(vs2) % BASE; |
193 | 0 | } |
194 | |
|
195 | 0 | adler = adler0 | (adler1 << 16); |
196 | | |
197 | | /* Process tail (len < 64). */ |
198 | 0 | if (len) { |
199 | 0 | goto rem_peel_copy; |
200 | 0 | } |
201 | | |
202 | 0 | return adler; |
203 | 0 | } |
204 | | |
205 | | #endif |