/src/zlib-ng/arch/x86/adler32_avx2.c
Line | Count | Source |
1 | | /* adler32_avx2.c -- compute the Adler-32 checksum of a data stream |
2 | | * Copyright (C) 1995-2011 Mark Adler |
3 | | * Copyright (C) 2022 Adam Stylinski |
4 | | * Authors: |
5 | | * Brian Bockelman <bockelman@gmail.com> |
6 | | * Adam Stylinski <kungfujesus06@gmail.com> |
7 | | * For conditions of distribution and use, see copyright notice in zlib.h |
8 | | */ |
9 | | |
10 | | #ifdef X86_AVX2 |
11 | | |
12 | | #include "zbuild.h" |
13 | | #include <immintrin.h> |
14 | | #include "adler32_p.h" |
15 | | #include "adler32_avx2_p.h" |
16 | | #include "x86_intrins.h" |
17 | | |
18 | | extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); |
19 | | extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); |
20 | | |
/* Shared AVX2 Adler-32 kernel: checksums the `len` bytes at `src` and, when
 * COPY is non-zero, simultaneously copies them to `dst`.
 *
 *   adler  running checksum, packed as (s2 << 16) | s1
 *   dst    copy destination; only dereferenced when COPY is non-zero
 *   src    input bytes
 *   len    number of input bytes
 *   COPY   compile-time flag selecting the copy variant. The function is
 *          force-inlined into the two wrappers below with a constant COPY,
 *          so the `if (COPY)` branches are resolved at compile time.
 *
 * Returns the updated packed Adler-32 value. */
Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    uint32_t adler0, adler1;
    /* Unpack the two 16 bit halves: adler1 = s2, adler0 = s1. */
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        /* Sub-vector remainder: finish with the scalar tail helper. */
        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
    } else if (len < 32) {
        /* Less than one AVX2 lane's worth: the 128 bit implementations
         * handle this size more efficiently. */
        if (COPY) {
            return adler32_copy_sse42(adler, dst, src, len);
        } else {
            return adler32_ssse3(adler, src, len);
        }
    }

    __m256i vs1, vs2, vs2_0;

    /* Descending byte weights for the s2 term. Over a 64 byte block, byte i
     * contributes (64 - i) times to s2; dot2v covers bytes 0..31 (weights
     * 64..33) and dot2v_0 covers bytes 32..63 (weights 32..1). dot2v_0 is
     * reused alone for the 32-byte loop. */
    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m256i dot3v = _mm256_set1_epi16(1);
    const __m256i zero = _mm256_setzero_si256();

    while (len >= 32) {
        /* Seed lane 0 with the scalar sums; upper lanes start at zero. */
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        __m256i vs1_0 = vs1;
        /* vs3 accumulates the previous iterations' s1 values; the implied
         * multiplication by the block size is deferred (shift below). */
        __m256i vs3 = _mm256_setzero_si256();
        vs2_0 = vs3;

        /* Process at most NMAX bytes (rounded down to a multiple of 32)
         * before reducing mod BASE, so the 32 bit accumulators cannot
         * overflow. */
        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
        len -= k;

        while (k >= 64) {
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
            src += 64;
            k -= 64;

            /* Horizontal byte sums: SAD against zero yields four 16 bit
             * partial sums per register, widened into 64 bit lanes. */
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);

            if (COPY) {
                _mm256_storeu_si256((__m256i*)dst, vbuf);
                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
                dst += 64;
            }

            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            /* Accumulate the pre-block s1 (vs1_0); scaled by 64 after the
             * loop via the shift-by-6 below. */
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
            vs2 = _mm256_add_epi32(vsum2, vs2);
            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
            vs1_0 = vs1;
        }

        /* Fold the second accumulator into vs2 and apply the deferred
         * multiply-by-64 (shift left 6) of the accumulated s1 values. */
        vs2 = _mm256_add_epi32(vs2_0, vs2);
        vs3 = _mm256_slli_epi32(vs3, 6);
        vs2 = _mm256_add_epi32(vs3, vs2);
        vs3 = _mm256_setzero_si256();

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's

            if (COPY) {
                _mm256_storeu_si256((__m256i*)dst, vbuf);
                dst += 32;
            }

            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
            vs2 = _mm256_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        /* Defer the multiplication with 32 to outside of the loop */
        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);

        /* The compiler is generating the following sequence for this integer modulus
         * when done the scalar way, in GPRs:

         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);

         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
         ...
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
         mov    %rsi,%rax // zero-extend this value to 64 bit precision in %rax
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
         shr    $0x2f,%rsi // shift right by 47
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
         ...
         // repeats for each element with vpextract instructions

         This is tricky with AVX2 for a number of reasons:
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
                 back down to 32 bit precision later (there is in AVX512)
             3.) Full width integer multiplications aren't cheap

         We can, however, do a relatively cheap sequence for horizontal sums.
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
         performed on the maximum possible inputs before overflow
         */


        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
         * what the compiler is doing to avoid integer divisions. */
        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    /* Re-pack the two halves before handling any remaining tail bytes. */
    adler = adler0 | (adler1 << 16);

    if (len) {
        /* Fewer than 32 bytes remain; peel them with the small-size paths. */
        goto rem_peel;
    }

    return adler;
}
163 | | |
164 | 0 | Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { |
165 | 0 | return adler32_copy_impl(adler, NULL, src, len, 0); |
166 | 0 | } |
167 | | |
168 | 0 | Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { |
169 | 0 | return adler32_copy_impl(adler, dst, src, len, 1); |
170 | 0 | } |
171 | | |
172 | | #endif |