/src/zlib-ng/arch/x86/adler32_avx2_p.h
Line | Count | Source |
1 | | /* adler32_avx2_p.h -- adler32 avx2 utility functions |
2 | | * Copyright (C) 2022 Adam Stylinski |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | */ |
5 | | |
6 | | #ifndef ADLER32_AVX2_P_H_ |
7 | | #define ADLER32_AVX2_P_H_ |
8 | | |
9 | | #if defined(X86_AVX2) || defined(X86_AVX512VNNI) |
10 | | |
/* Horizontal add of the eight 32-bit lanes of x, adapted from Agner Fog's
 * vector library. The vector is folded in half three times
 * (256 -> 128 -> 64 -> 32 bits); lane additions wrap modulo 2^32. */
static inline uint32_t hsum256(__m256i x) {
    const __m128i lower_half = _mm256_castsi256_si128(x);
    const __m128i upper_half = _mm256_extracti128_si256(x, 1);

    /* 8 lanes -> 4 partial sums. */
    __m128i acc = _mm_add_epi32(upper_half, lower_half);
    /* 4 -> 2: add the high 64 bits onto the low 64 bits. */
    acc = _mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc));
    /* 2 -> 1: selector 1 moves lane 1 into lane 0 before the final add. */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 1));

    /* The total now sits in lane 0. */
    return (uint32_t)_mm_cvtsi128_si32(acc);
}
Unexecuted instantiation: adler32_avx512_vnni.c:hsum256 |
19 | | |
/* Horizontal add of the even-indexed 32-bit lanes (0, 2, 4 and 6) of x.
 * The odd-indexed lanes never reach the result; the original note states
 * that they are expected to be zero anyway. */
static inline uint32_t partial_hsum256(__m256i x) {
    /* Selector that packs lanes 0, 2, 4, 6 into the low 128 bits. The upper
     * four entries are filler, because that half of the permuted vector is
     * dropped by the cast below. */
    const __m256i even_lane_idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
    const __m128i evens = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(x, even_lane_idx));

    /* 4 -> 2: add the high 64 bits onto the low 64 bits. */
    __m128i acc = _mm_add_epi32(evens, _mm_unpackhi_epi64(evens, evens));
    /* 2 -> 1: selector 1 moves lane 1 into lane 0 before the final add. */
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 1));

    /* The total now sits in lane 0. */
    return (uint32_t)_mm_cvtsi128_si32(acc);
}
Unexecuted instantiation: adler32_avx512_vnni.c:partial_hsum256 |
30 | | #endif |
31 | | |
32 | | #endif |