/src/libdeflate/lib/x86/adler32_template.h
Line | Count | Source |
1 | | /* |
2 | | * x86/adler32_template.h - template for vectorized Adler-32 implementations |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | */ |
27 | | |
28 | | /* |
29 | | * This file is a "template" for instantiating Adler-32 functions for x86. |
30 | | * The "parameters" are: |
31 | | * |
32 | | * SUFFIX: |
33 | | * Name suffix to append to all instantiated functions. |
34 | | * ATTRIBUTES: |
35 | | * Target function attributes to use. Must satisfy the dependencies of the |
36 | | * other parameters as follows: |
37 | | * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2 |
38 | | * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2 |
39 | | * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni |
40 | | * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni |
41 | | * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni |
42 | | * (Other combinations are not useful and have not been tested.) |
43 | | * VL: |
44 | | * Vector length in bytes. Must be 16, 32, or 64. |
45 | | * USE_VNNI: |
46 | | * If 1, use the VNNI dot product based algorithm. |
47 | | * If 0, use the legacy SSE2 and AVX2 compatible algorithm. |
48 | | * USE_AVX512: |
49 | | * If 1, take advantage of AVX-512 features such as masking. This doesn't |
50 | | * enable the use of 512-bit vectors; the vector length is controlled by |
51 | | * VL. If 0, assume that the CPU might not support AVX-512. |
52 | | */ |
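As an illustration of how this template is consumed, the sketch below shows one plausible instantiation: the including file defines the parameters, includes this header, and the #undef block at the end of this file then clears them so another instantiation can follow. The ATTRIBUTES spelling and the shape of the including file are assumptions; only the parameter names and the valid combinations come from the comment above, and the resulting function names match the instantiation names shown later in this report.

    /*
     * Hypothetical instantiation of the SSE2 variant (a sketch; the real
     * definitions live in libdeflate's adler32 code, not in this header):
     */
    #define SUFFIX      _sse2
    #define ATTRIBUTES  _target_attribute("sse2")  /* assumed helper macro */
    #define VL          16
    #define USE_VNNI    0
    #define USE_AVX512  0
    #include "adler32_template.h"
    /* Via ADD_SUFFIX(), this defines adler32_x86_sse2() and reduce_to_32bits_sse2(). */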
53 | | |
54 | | #if VL == 16 |
55 | 0 | # define vec_t __m128i |
56 | | # define mask_t u16 |
57 | | # define LOG2_VL 4 |
58 | | # define VADD8(a, b) _mm_add_epi8((a), (b)) |
59 | 0 | # define VADD16(a, b) _mm_add_epi16((a), (b)) |
60 | 0 | # define VADD32(a, b) _mm_add_epi32((a), (b)) |
61 | | # if USE_AVX512 |
62 | | # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) |
63 | | # else |
64 | | # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) |
65 | | # endif |
66 | 0 | # define VLOAD(p) _mm_load_si128((const void *)(p)) |
67 | 0 | # define VLOADU(p) _mm_loadu_si128((const void *)(p)) |
68 | | # define VMADD16(a, b) _mm_madd_epi16((a), (b)) |
69 | | # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) |
70 | | # define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) |
71 | | # define VSAD8(a, b) _mm_sad_epu8((a), (b)) |
72 | | # define VSET1_8(a) _mm_set1_epi8(a) |
73 | | # define VSET1_32(a) _mm_set1_epi32(a) |
74 | 0 | # define VSETZERO() _mm_setzero_si128() |
75 | | # define VSLL32(a, b) _mm_slli_epi32((a), (b)) |
76 | | # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) |
77 | | # define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) |
78 | | #elif VL == 32 |
79 | 602k | # define vec_t __m256i |
80 | | # define mask_t u32 |
81 | | # define LOG2_VL 5 |
82 | 0 | # define VADD8(a, b) _mm256_add_epi8((a), (b)) |
83 | 1.14M | # define VADD16(a, b) _mm256_add_epi16((a), (b)) |
84 | 578k | # define VADD32(a, b) _mm256_add_epi32((a), (b)) |
85 | | # if USE_AVX512 |
86 | 0 | # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) |
87 | | # else |
88 | 0 | # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) |
89 | | # endif |
90 | 4.16k | # define VLOAD(p) _mm256_load_si256((const void *)(p)) |
91 | 574k | # define VLOADU(p) _mm256_loadu_si256((const void *)(p)) |
92 | | # define VMADD16(a, b) _mm256_madd_epi16((a), (b)) |
93 | 0 | # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) |
94 | | # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) |
95 | | # define VSAD8(a, b) _mm256_sad_epu8((a), (b)) |
96 | 0 | # define VSET1_8(a) _mm256_set1_epi8(a) |
97 | | # define VSET1_32(a) _mm256_set1_epi32(a) |
98 | 1.04k | # define VSETZERO() _mm256_setzero_si256() |
99 | | # define VSLL32(a, b) _mm256_slli_epi32((a), (b)) |
100 | | # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) |
101 | | # define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) |
102 | | #elif VL == 64 |
103 | 0 | # define vec_t __m512i |
104 | | # define mask_t u64 |
105 | | # define LOG2_VL 6 |
106 | 0 | # define VADD8(a, b) _mm512_add_epi8((a), (b)) |
107 | | # define VADD16(a, b) _mm512_add_epi16((a), (b)) |
108 | 0 | # define VADD32(a, b) _mm512_add_epi32((a), (b)) |
109 | 0 | # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) |
110 | 0 | # define VLOAD(p) _mm512_load_si512((const void *)(p)) |
111 | 0 | # define VLOADU(p) _mm512_loadu_si512((const void *)(p)) |
112 | | # define VMADD16(a, b) _mm512_madd_epi16((a), (b)) |
113 | 0 | # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) |
114 | | # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) |
115 | | # define VSAD8(a, b) _mm512_sad_epu8((a), (b)) |
116 | 0 | # define VSET1_8(a) _mm512_set1_epi8(a) |
117 | | # define VSET1_32(a) _mm512_set1_epi32(a) |
118 | 0 | # define VSETZERO() _mm512_setzero_si512() |
119 | | # define VSLL32(a, b) _mm512_slli_epi32((a), (b)) |
120 | | # define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b)) |
121 | | # define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b)) |
122 | | #else |
123 | | # error "unsupported vector length" |
124 | | #endif |
125 | | |
126 | 0 | #define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c)) |
127 | 0 | #define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) |
128 | 3.90k | #define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) |
129 | | #define VADD32_7X(a, b, c, d, e, f, g) \ |
130 | 0 | VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) |
131 | | |
132 | | /* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */ |
133 | | #undef reduce_to_32bits |
134 | | static forceinline ATTRIBUTES void |
135 | | ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) |
136 | 3.90k | { |
137 | 3.90k | __m128i v_s1_128, v_s2_128; |
138 | | #if VL == 16 |
139 | | { |
140 | | v_s1_128 = v_s1; |
141 | | v_s2_128 = v_s2; |
142 | | } |
143 | | #else |
144 | | { |
145 | | __m256i v_s1_256, v_s2_256; |
146 | | #if VL == 32 |
147 | | v_s1_256 = v_s1; |
148 | | v_s2_256 = v_s2; |
149 | | #else |
150 | | /* Reduce 512 bits to 256 bits. */ |
151 | | v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), |
152 | | _mm512_extracti64x4_epi64(v_s1, 1)); |
153 | | v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), |
154 | | _mm512_extracti64x4_epi64(v_s2, 1)); |
155 | | #endif |
156 | | /* Reduce 256 bits to 128 bits. */ |
157 | | v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), |
158 | | _mm256_extracti128_si256(v_s1_256, 1)); |
159 | | v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), |
160 | | _mm256_extracti128_si256(v_s2_256, 1)); |
161 | | } |
162 | | #endif |
163 | | |
164 | | /* |
165 | | * Reduce 128 bits to 32 bits. |
166 | | * |
167 | | * If the bytes were summed into v_s1 using psadbw + paddd, then ignore |
168 | | * the odd-indexed elements of v_s1_128 since they are zero. |
169 | | */ |
170 | | #if USE_VNNI |
171 | | v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); |
172 | | #endif |
173 | 3.90k | v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); |
174 | 3.90k | v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); |
175 | 3.90k | v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); |
176 | | |
177 | 3.90k | *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); |
178 | 3.90k | *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); |
179 | 3.90k | } |
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx512_vl512_vnni
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx512_vl256_vnni
Unexecuted instantiation: adler32.c:reduce_to_32bits_avx2_vnni
adler32.c:reduce_to_32bits_avx2: 3.90k executions
Unexecuted instantiation: adler32.c:reduce_to_32bits_sse2 |
180 | 3.90k | #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) |
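For clarity, here is a plain-C model of what reduce_to_32bits() computes, treating each accumulator vector as an array of VL/4 32-bit lanes. This is an illustrative sketch only; it reuses the u32 type from this codebase and the hypothetical name reduce_to_32bits_model is not part of the template.

    static void
    reduce_to_32bits_model(const u32 *v_s1_lanes, const u32 *v_s2_lanes,
                           size_t num_lanes /* VL / 4 */, u32 *s1_p, u32 *s2_p)
    {
            u32 sum1 = 0, sum2 = 0;
            size_t i;

            /*
             * Horizontally add all 32-bit lanes.  Lanes that the vector code
             * knows are zero (e.g. the odd lanes of v_s1 after psadbw)
             * contribute nothing, so summing every lane gives the same result.
             */
            for (i = 0; i < num_lanes; i++) {
                    sum1 += v_s1_lanes[i];
                    sum2 += v_s2_lanes[i];
            }
            *s1_p += sum1;
            *s2_p += sum2;
    }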
181 | | |
182 | | static ATTRIBUTES u32 |
183 | | ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len) |
184 | 1.04k | { |
185 | | #if USE_VNNI |
186 | | /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ |
187 | | static const u8 _aligned_attribute(VL) raw_mults[VL] = { |
188 | | #if VL == 64 |
189 | | 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, |
190 | | 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, |
191 | | #endif |
192 | | #if VL >= 32 |
193 | | 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, |
194 | | #endif |
195 | | 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, |
196 | | }; |
197 | 0 | const vec_t ones = VSET1_8(1); |
198 | | #else |
199 | | /* |
200 | | * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. |
201 | | * For VL==32 the ordering is weird because it has to match the way that |
202 | | * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. |
203 | | */ |
204 | | static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { |
205 | | #if VL == 16 |
206 | | { 32, 31, 30, 29, 28, 27, 26, 25 }, |
207 | | { 24, 23, 22, 21, 20, 19, 18, 17 }, |
208 | | { 16, 15, 14, 13, 12, 11, 10, 9 }, |
209 | | { 8, 7, 6, 5, 4, 3, 2, 1 }, |
210 | | #elif VL == 32 |
211 | | { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, |
212 | | { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, |
213 | | { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, |
214 | | { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, |
215 | | #else |
216 | | # error "unsupported parameters" |
217 | | #endif |
218 | | }; |
219 | 1.04k | const vec_t mults_a = VLOAD(raw_mults[0]); |
220 | 1.04k | const vec_t mults_b = VLOAD(raw_mults[1]); |
221 | 1.04k | const vec_t mults_c = VLOAD(raw_mults[2]); |
222 | 1.04k | const vec_t mults_d = VLOAD(raw_mults[3]); |
223 | | #endif |
224 | 1.04k | const vec_t zeroes = VSETZERO(); |
225 | 1.04k | u32 s1 = adler & 0xFFFF; |
226 | 1.04k | u32 s2 = adler >> 16; |
227 | | |
228 | | /* |
229 | | * If the length is large and the pointer is misaligned, align it. |
230 | | * For smaller lengths, just take the misaligned load penalty. |
231 | | */ |
232 | 1.04k | if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { |
233 | 656 | do { |
234 | 656 | s1 += *p++; |
235 | 656 | s2 += s1; |
236 | 656 | len--; |
237 | 656 | } while ((uintptr_t)p & (VL-1)); |
238 | 41 | s1 %= DIVISOR; |
239 | 41 | s2 %= DIVISOR; |
240 | 41 | } |
241 | | |
242 | | #if USE_VNNI |
243 | | /* |
244 | | * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or |
245 | | * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by |
246 | | * the signed bytes of another vector and adds the sums in groups of 4 |
247 | | * to the 32-bit elements of a third vector. We use it in two ways: |
248 | | * multiplying the data bytes by a sequence like 64,63,62,...,1 for |
249 | | * calculating part of s2, and multiplying the data bytes by an all-ones |
250 | | * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones |
251 | | * trick seems to be faster than the alternative of vpsadbw + vpaddd. |
252 | | */ |
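	/*
	 * Illustrative scalar model of VDPBUSD (a sketch for reference, not
	 * taken from this file): for each 32-bit lane i of the accumulator c,
	 * with the a bytes treated as unsigned and the b bytes as signed,
	 *
	 *	c[i] += a[4*i+0]*b[4*i+0] + a[4*i+1]*b[4*i+1] +
	 *		a[4*i+2]*b[4*i+2] + a[4*i+3]*b[4*i+3];
	 *
	 * i.e. the products are summed in groups of 4 and accumulated without
	 * saturation (the saturating form is the separate vpdpbusds).
	 */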
253 | 0 | while (len) { |
254 | | /* |
255 | | * Calculate the length of the next data chunk such that s1 and |
256 | | * s2 are guaranteed to not exceed UINT32_MAX. |
257 | | */ |
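		/*
		 * A worked bound (an illustrative sketch; MAX_CHUNK_LEN itself
		 * is defined outside this file): assuming s1 and s2 have
		 * already been reduced mod DIVISOR when the chunk starts and
		 * every byte is at most 255, after n bytes
		 *
		 *	s2 <= (DIVISOR - 1) + n*(DIVISOR - 1) + 255*n*(n + 1)/2
		 *
		 * With DIVISOR == 65521, the largest n keeping this at most
		 * UINT32_MAX is 5552; rounding down to a multiple of 4*VL
		 * (or of 2*VL in the non-VNNI code below) gives chunk sizes
		 * such as the 5536 mentioned there.
		 */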
258 | 0 | size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); |
259 | 0 | vec_t mults = VLOAD(raw_mults); |
260 | 0 | vec_t v_s1 = zeroes; |
261 | 0 | vec_t v_s2 = zeroes; |
262 | | |
263 | | s2 += s1 * n; |
264 | | len -= n; |
265 | | |
266 | 0 | if (n >= 4*VL) { |
267 | 0 | vec_t v_s1_b = zeroes; |
268 | 0 | vec_t v_s1_c = zeroes; |
269 | 0 | vec_t v_s1_d = zeroes; |
270 | 0 | vec_t v_s2_b = zeroes; |
271 | 0 | vec_t v_s2_c = zeroes; |
272 | 0 | vec_t v_s2_d = zeroes; |
273 | 0 | vec_t v_s1_sums = zeroes; |
274 | 0 | vec_t v_s1_sums_b = zeroes; |
275 | 0 | vec_t v_s1_sums_c = zeroes; |
276 | 0 | vec_t v_s1_sums_d = zeroes; |
277 | 0 | vec_t tmp0, tmp1; |
278 | |
279 | 0 | do { |
280 | 0 | vec_t data_a = VLOADU(p + 0*VL); |
281 | 0 | vec_t data_b = VLOADU(p + 1*VL); |
282 | 0 | vec_t data_c = VLOADU(p + 2*VL); |
283 | 0 | vec_t data_d = VLOADU(p + 3*VL); |
284 | | |
285 | | /* |
286 | | * Workaround for gcc bug where it generates |
287 | | * unnecessary move instructions |
288 | | * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) |
289 | | */ |
290 | | #if GCC_PREREQ(1, 0) |
291 | | __asm__("" : "+v" (data_a), "+v" (data_b), |
292 | | "+v" (data_c), "+v" (data_d)); |
293 | | #endif |
294 | | |
295 | 0 | v_s2 = VDPBUSD(v_s2, data_a, mults); |
296 | 0 | v_s2_b = VDPBUSD(v_s2_b, data_b, mults); |
297 | 0 | v_s2_c = VDPBUSD(v_s2_c, data_c, mults); |
298 | 0 | v_s2_d = VDPBUSD(v_s2_d, data_d, mults); |
299 | | |
300 | 0 | v_s1_sums = VADD32(v_s1_sums, v_s1); |
301 | 0 | v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); |
302 | 0 | v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); |
303 | 0 | v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); |
304 | | |
305 | 0 | v_s1 = VDPBUSD(v_s1, data_a, ones); |
306 | 0 | v_s1_b = VDPBUSD(v_s1_b, data_b, ones); |
307 | 0 | v_s1_c = VDPBUSD(v_s1_c, data_c, ones); |
308 | 0 | v_s1_d = VDPBUSD(v_s1_d, data_d, ones); |
309 | | |
310 | | /* Same gcc bug workaround. See above */ |
311 | | #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) |
312 | | __asm__("" : "+v" (v_s2), "+v" (v_s2_b), |
313 | | "+v" (v_s2_c), "+v" (v_s2_d), |
314 | | "+v" (v_s1_sums), |
315 | | "+v" (v_s1_sums_b), |
316 | | "+v" (v_s1_sums_c), |
317 | | "+v" (v_s1_sums_d), |
318 | | "+v" (v_s1), "+v" (v_s1_b), |
319 | | "+v" (v_s1_c), "+v" (v_s1_d)); |
320 | | #endif |
321 | 0 | p += 4*VL; |
322 | 0 | n -= 4*VL; |
323 | 0 | } while (n >= 4*VL); |
324 | | |
325 | | /* |
326 | | * Reduce into v_s1 and v_s2 as follows: |
327 | | * |
328 | | * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + |
329 | | * (4*VL)*(v_s1_sums + v_s1_sums_b + |
330 | | * v_s1_sums_c + v_s1_sums_d) + |
331 | | * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c |
332 | | * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d |
333 | | */ |
334 | 0 | tmp0 = VADD32(v_s1, v_s1_b); |
335 | 0 | tmp1 = VADD32(v_s1, v_s1_c); |
336 | 0 | v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, |
337 | | v_s1_sums_c, v_s1_sums_d); |
338 | 0 | v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); |
339 | 0 | v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), |
340 | 0 | VSLL32(tmp0, LOG2_VL + 1), |
341 | 0 | VSLL32(tmp1, LOG2_VL), |
342 | 0 | v_s2, v_s2_b, v_s2_c, v_s2_d); |
343 | 0 | } |
344 | | |
345 | | /* Process the last 0 <= n < 4*VL bytes of the chunk. */ |
346 | 0 | if (n >= 2*VL) { |
347 | 0 | const vec_t data_a = VLOADU(p + 0*VL); |
348 | 0 | const vec_t data_b = VLOADU(p + 1*VL); |
349 | |
350 | 0 | v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1)); |
351 | 0 | v_s1 = VDPBUSD(v_s1, data_a, ones); |
352 | 0 | v_s1 = VDPBUSD(v_s1, data_b, ones); |
353 | 0 | v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL)); |
354 | 0 | v_s2 = VDPBUSD(v_s2, data_a, mults); |
355 | 0 | v_s2 = VDPBUSD(v_s2, data_b, mults); |
356 | 0 | p += 2*VL; |
357 | 0 | n -= 2*VL; |
358 | 0 | } |
359 | 0 | if (n) { |
360 | | /* Process the last 0 < n < 2*VL bytes of the chunk. */ |
361 | 0 | vec_t data; |
362 | |
363 | 0 | v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n))); |
364 | |
365 | 0 | mults = VADD8(mults, VSET1_8((int)n - VL)); |
366 | 0 | if (n > VL) { |
367 | 0 | data = VLOADU(p); |
368 | 0 | v_s1 = VDPBUSD(v_s1, data, ones); |
369 | 0 | v_s2 = VDPBUSD(v_s2, data, mults); |
370 | 0 | p += VL; |
371 | 0 | n -= VL; |
372 | 0 | mults = VADD8(mults, VSET1_8(-VL)); |
373 | 0 | } |
374 | | /* |
375 | | * Process the last 0 < n <= VL bytes of the chunk. |
376 | | * Utilize a masked load if it's available. |
377 | | */ |
378 | | #if USE_AVX512 |
379 | 0 | data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p); |
380 | | #else |
381 | | data = zeroes; |
382 | | memcpy(&data, p, n); |
383 | | #endif |
384 | 0 | v_s1 = VDPBUSD(v_s1, data, ones); |
385 | 0 | v_s2 = VDPBUSD(v_s2, data, mults); |
386 | 0 | p += n; |
387 | 0 | } |
388 | |
389 | 0 | reduce_to_32bits(v_s1, v_s2, &s1, &s2); |
390 | 0 | s1 %= DIVISOR; |
391 | 0 | s2 %= DIVISOR; |
392 | 0 | } |
393 | | #else /* USE_VNNI */ |
394 | | /* |
395 | | * This is Adler-32 for SSE2 and AVX2. |
396 | | * |
397 | | * To horizontally sum bytes, use psadbw + paddd, where one of the |
398 | | * arguments to psadbw is all-zeroes. |
399 | | * |
400 | | * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL |
401 | | * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw |
402 | | * to sum, for each i across iterations, byte i into a corresponding |
403 | | * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd |
404 | | * to multiply each counter by (2*VL - i), then add the products to s2. |
405 | | * |
406 | | * An alternative implementation would use pmaddubsw and pmaddwd in the |
407 | | * inner loop to do (2*VL - i)*data[i] directly and add the products in |
408 | | * groups of 4 to 32-bit counters. However, on average that approach |
409 | | * seems to be slower than the current approach which delays the |
410 | | * multiplications. Also, pmaddubsw requires SSSE3; the current |
411 | | * approach keeps the implementation aligned between SSE2 and AVX2. |
412 | | * |
413 | | * The inner loop processes 2*VL bytes per iteration. Increasing this |
414 | | * to 4*VL doesn't seem to be helpful here. |
415 | | */ |
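	/*
	 * Scalar sketch of the strategy just described (illustration only,
	 * shown for a chunk length n that is a multiple of 2*VL):
	 *
	 *	u32 byte_sums[2*VL] = {0};
	 *	u32 s1_sums = 0, s1_chunk = 0;
	 *
	 *	s2 += s1 * n;
	 *	for (j = 0; j < n; j += 2*VL) {
	 *		s1_sums += s1_chunk;
	 *		for (i = 0; i < 2*VL; i++) {
	 *			byte_sums[i] += p[j + i];
	 *			s1_chunk += p[j + i];
	 *		}
	 *	}
	 *	s1 += s1_chunk;
	 *	s2 += (2*VL)*s1_sums;
	 *	for (i = 0; i < 2*VL; i++)
	 *		s2 += (2*VL - i)*byte_sums[i];
	 *
	 * The vector code below computes the same quantities, with byte_sums[]
	 * held in the 16-bit lanes of v_byte_sums_{a,b,c,d} and the final
	 * weighted sum done by pmaddwd against mults_{a,b,c,d}.
	 */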
416 | 4.97k | while (len) { |
417 | | /* |
418 | | * Calculate the length of the next data chunk such that s1 and |
419 | | * s2 are guaranteed to not exceed UINT32_MAX, and every |
420 | | * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. |
421 | | * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are |
422 | | * used with pmaddwd which does signed multiplication. In the |
423 | | * SSE2 case this limits chunks to 4096 bytes instead of 5536. |
424 | | */ |
425 | 3.93k | size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), |
426 | 3.93k | MAX_CHUNK_LEN) & ~(2*VL - 1)); |
427 | 3.93k | len -= n; |
428 | | |
429 | 3.93k | if (n >= 2*VL) { |
430 | 3.90k | vec_t v_s1 = zeroes; |
431 | 3.90k | vec_t v_s1_sums = zeroes; |
432 | 3.90k | vec_t v_byte_sums_a = zeroes; |
433 | 3.90k | vec_t v_byte_sums_b = zeroes; |
434 | 3.90k | vec_t v_byte_sums_c = zeroes; |
435 | 3.90k | vec_t v_byte_sums_d = zeroes; |
436 | 3.90k | vec_t v_s2; |
437 | | |
438 | 3.90k | s2 += s1 * (n & ~(2*VL - 1)); |
439 | | |
440 | 287k | do { |
441 | 287k | vec_t data_a = VLOADU(p + 0*VL); |
442 | 287k | vec_t data_b = VLOADU(p + 1*VL); |
443 | | |
444 | 287k | v_s1_sums = VADD32(v_s1_sums, v_s1); |
445 | 287k | v_byte_sums_a = VADD16(v_byte_sums_a, |
446 | 287k | VUNPACKLO8(data_a, zeroes)); |
447 | 287k | v_byte_sums_b = VADD16(v_byte_sums_b, |
448 | 287k | VUNPACKHI8(data_a, zeroes)); |
449 | 287k | v_byte_sums_c = VADD16(v_byte_sums_c, |
450 | 287k | VUNPACKLO8(data_b, zeroes)); |
451 | 287k | v_byte_sums_d = VADD16(v_byte_sums_d, |
452 | 287k | VUNPACKHI8(data_b, zeroes)); |
453 | 287k | v_s1 = VADD32(v_s1, |
454 | 287k | VADD32(VSAD8(data_a, zeroes), |
455 | 287k | VSAD8(data_b, zeroes))); |
456 | | /* |
457 | | * Workaround for gcc bug where it generates |
458 | | * unnecessary move instructions |
459 | | * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) |
460 | | */ |
461 | | #if GCC_PREREQ(1, 0) |
462 | | __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), |
463 | | "+x" (v_byte_sums_a), |
464 | | "+x" (v_byte_sums_b), |
465 | | "+x" (v_byte_sums_c), |
466 | | "+x" (v_byte_sums_d)); |
467 | | #endif |
468 | 287k | p += 2*VL; |
469 | 287k | n -= 2*VL; |
470 | 287k | } while (n >= 2*VL); |
471 | | |
472 | | /* |
473 | | * Calculate v_s2 as (2*VL)*v_s1_sums + |
474 | | * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. |
475 | | * Then update s1 and s2 from v_s1 and v_s2. |
476 | | */ |
477 | 3.90k | v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), |
478 | 3.90k | VMADD16(v_byte_sums_a, mults_a), |
479 | 3.90k | VMADD16(v_byte_sums_b, mults_b), |
480 | 3.90k | VMADD16(v_byte_sums_c, mults_c), |
481 | 3.90k | VMADD16(v_byte_sums_d, mults_d)); |
482 | 3.90k | reduce_to_32bits(v_s1, v_s2, &s1, &s2); |
483 | 3.90k | } |
484 | | /* |
485 | | * Process the last 0 <= n < 2*VL bytes of the chunk using |
486 | | * scalar instructions and reduce s1 and s2 mod DIVISOR. |
487 | | */ |
488 | 3.93k | ADLER32_CHUNK(s1, s2, p, n); |
489 | 3.93k | } |
490 | | #endif /* !USE_VNNI */ |
491 | 1.04k | return (s2 << 16) | s1; |
492 | 1.04k | } |
Unexecuted instantiation: adler32.c:adler32_x86_avx512_vl512_vnni
Unexecuted instantiation: adler32.c:adler32_x86_avx512_vl256_vnni
Unexecuted instantiation: adler32.c:adler32_x86_avx2_vnni
adler32.c:adler32_x86_avx2: 1.04k executions
Unexecuted instantiation: adler32.c:adler32_x86_sse2 |
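For context, the instantiated functions would be used roughly as sketched below. This is a hypothetical sketch: the dispatch between instantiations happens elsewhere in adler32.c and is not shown, the helper name adler32_example is invented, and one variant is hard-coded purely for illustration.

    /* Hypothetical usage sketch (not code from adler32.c). */
    static u32
    adler32_example(const u8 *part1, size_t len1, const u8 *part2, size_t len2)
    {
            u32 adler = 1;  /* standard Adler-32 initial state: s1 = 1, s2 = 0 */

            /*
             * The value returned by one call can be fed into the next, so the
             * input may be processed in pieces.
             */
            adler = adler32_x86_avx2(adler, part1, len1);
            adler = adler32_x86_avx2(adler, part2, len2);
            return adler;
    }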
493 | | |
494 | | #undef vec_t |
495 | | #undef mask_t |
496 | | #undef LOG2_VL |
497 | | #undef VADD8 |
498 | | #undef VADD16 |
499 | | #undef VADD32 |
500 | | #undef VDPBUSD |
501 | | #undef VLOAD |
502 | | #undef VLOADU |
503 | | #undef VMADD16 |
504 | | #undef VMASKZ_LOADU |
505 | | #undef VMULLO32 |
506 | | #undef VSAD8 |
507 | | #undef VSET1_8 |
508 | | #undef VSET1_32 |
509 | | #undef VSETZERO |
510 | | #undef VSLL32 |
511 | | #undef VUNPACKLO8 |
512 | | #undef VUNPACKHI8 |
513 | | |
514 | | #undef SUFFIX |
515 | | #undef ATTRIBUTES |
516 | | #undef VL |
517 | | #undef USE_VNNI |
518 | | #undef USE_AVX512 |