Coverage Report

Created: 2025-12-14 06:55

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/adler32_avx2.c
Line
Count
Source
1
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
2
 * Copyright (C) 1995-2011 Mark Adler
3
 * Copyright (C) 2022 Adam Stylinski
4
 * Authors:
5
 *   Brian Bockelman <bockelman@gmail.com>
6
 *   Adam Stylinski <kungfujesus06@gmail.com>
7
 * For conditions of distribution and use, see copyright notice in zlib.h
8
 */
9
10
#ifdef X86_AVX2
11
12
#include "zbuild.h"
13
#include <immintrin.h>
14
#include "adler32_p.h"
15
#include "adler32_avx2_p.h"
16
#include "x86_intrins.h"
17
18
extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
19
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
20
21
2.28M
/* Shared AVX2 implementation behind adler32_avx2() and adler32_copy_avx2().
 *
 * adler: running checksum; the low 16 bits seed the first sum (adler0) and the high 16 bits the second (adler1).
 * dst:   destination buffer, written only when COPY is nonzero.
 * src:   input bytes; a NULL src makes the function return 1.
 * len:   number of input bytes; 0 returns adler unchanged.
 * COPY:  nonzero to also store every loaded chunk of src to dst.
 *
 * Inputs (or tails) shorter than 32 bytes are handed to helper routines defined outside this file;
 * everything else is consumed 64 or 32 bytes at a time, with a modulo-BASE reduction after at most NMAX bytes.
 * Returns the updated checksum.
 */
static inline uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
22
2.28M
    if (src == NULL) return 1L;
23
2.28M
    if (len == 0) return adler;
24
25
2.28M
    uint32_t adler0, adler1;
26
2.28M
    adler1 = (adler >> 16) & 0xffff; /* high half of the packed checksum */
27
2.28M
    adler0 = adler & 0xffff; /* low half of the packed checksum */
28
29
2.49M
rem_peel:
30
2.49M
    if (len < 16) { /* short input or tail: helpers that take the two halves separately */
31
1.78M
        if (COPY) {
32
0
            return adler32_copy_len_16(adler0, src, dst, len, adler1);
33
1.78M
        } else {
34
1.78M
            return adler32_len_16(adler0, src, len, adler1);
35
1.78M
        }
36
1.78M
    } else if (len < 32) { /* 16..31 bytes: narrower SIMD routines that take the packed checksum */
37
502k
        if (COPY) {
38
0
            return adler32_copy_sse42(adler, dst, src, len);
39
502k
        } else {
40
502k
            return adler32_ssse3(adler, src, len);
41
502k
        }
42
502k
    }
43
44
214k
    __m256i vs1, vs2, vs2_0;
45
46
214k
    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
47
214k
                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33); /* weights 64..33: first half of a 64-byte chunk */
48
214k
    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
49
214k
                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); /* weights 32..1: second half, or a lone 32-byte chunk */
50
214k
    const __m256i dot3v = _mm256_set1_epi16(1); /* all-ones multiplier, so madd_epi16 only adds adjacent shorts */
51
214k
    const __m256i zero = _mm256_setzero_si256();
52
53
435k
    while (len >= 32) {
54
221k
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); /* lane 0 = adler0, all other lanes zero */
55
221k
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); /* lane 0 = adler1, all other lanes zero */
56
221k
        __m256i vs1_0 = vs1; /* vs1 as it stood before the chunk currently being processed */
57
221k
        __m256i vs3 = _mm256_setzero_si256(); /* running sum of those pre-chunk vs1 values */
58
221k
        vs2_0 = vs3;
59
60
221k
        size_t k = MIN(len, NMAX); /* at most NMAX bytes between modulo reductions */
61
221k
        k -= k % 32; /* whole 32-byte vectors only */
62
221k
        len -= k;
63
64
1.14M
        while (k >= 64) { /* two vectors (64 bytes) per iteration */
65
920k
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
66
920k
            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
67
920k
            src += 64;
68
920k
            k -= 64;
69
70
920k
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); /* byte sums of the first vector */
71
920k
            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero); /* byte sums of the second vector */
72
73
920k
            if (COPY) {
74
0
                _mm256_storeu_si256((__m256i*)dst, vbuf);
75
0
                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
76
0
                dst += 64;
77
0
            }
78
79
920k
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
80
920k
            vs3 = _mm256_add_epi32(vs3, vs1_0); /* pre-chunk vs1; scaled by 64 after the loop */
81
920k
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
82
920k
            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
83
920k
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
84
920k
            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
85
920k
            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
86
920k
            vs2 = _mm256_add_epi32(vsum2, vs2);
87
920k
            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
88
920k
            vs1_0 = vs1;
89
920k
        }
90
91
221k
        vs2 = _mm256_add_epi32(vs2_0, vs2); /* merge the second-vector weighted sums */
92
221k
        vs3 = _mm256_slli_epi32(vs3, 6); /* deferred multiplication by 64 for the 64-byte loop */
93
221k
        vs2 = _mm256_add_epi32(vs3, vs2);
94
221k
        vs3 = _mm256_setzero_si256(); /* restart the accumulator for the 32-byte loop below */
95
96
426k
        while (k >= 32) {
97
            /*
98
               vs1 = adler + sum(c[i])
99
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
100
            */
101
204k
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
102
204k
            src += 32;
103
204k
            k -= 32;
104
105
204k
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff against zero: one byte-sum per 64-bit lane
106
107
204k
            if (COPY) {
108
0
                _mm256_storeu_si256((__m256i*)dst, vbuf);
109
0
                dst += 32;
110
0
            }
111
112
204k
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
113
204k
            vs3 = _mm256_add_epi32(vs3, vs1_0); /* pre-chunk vs1; scaled by 32 after the loop */
114
204k
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
115
204k
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
116
204k
            vs2 = _mm256_add_epi32(vsum2, vs2);
117
204k
            vs1_0 = vs1;
118
204k
        }
119
120
        /* Defer the multiplication by 32 to outside of the loop */
121
221k
        vs3 = _mm256_slli_epi32(vs3, 5);
122
221k
        vs2 = _mm256_add_epi32(vs2, vs3);
123
124
        /* The compiler generates the following sequence for this integer modulus
125
         * when done the scalar way, in GPRs:
126
127
         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
128
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
129
130
         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
131
         ...
132
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
133
         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
134
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
135
         shr    $0x2f,%rsi // shift right by 47
136
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
137
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
138
         ...
139
         // repeats for each element with vpextract instructions
140
141
         This is tricky with AVX2 for a number of reasons:
142
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
143
             2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
144
                 back down to 32 bit precision later (there is in AVX512)
145
             3.) Full width integer multiplications aren't cheap
146
147
         We can, however, do a relatively cheap sequence for horizontal sums.
148
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
149
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
150
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
151
         performed on the maximum possible inputs before overflow.
152
         */
153
154
155
         /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
156
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
157
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
158
          * what the compiler is doing to avoid integer divisions. */
159
221k
         adler0 = partial_hsum256(vs1) % BASE; /* helper defined outside this file; presumably a horizontal sum of the populated lanes */
160
221k
         adler1 = hsum256(vs2) % BASE; /* helper defined outside this file; presumably a horizontal sum of all lanes */
161
221k
    }
162
163
214k
    adler = adler0 | (adler1 << 16); /* repack: the 16..31 byte path at rem_peel reads the packed value */
164
165
214k
    if (len) { /* fewer than 32 bytes remain once the loop above has exited */
166
210k
        goto rem_peel;
167
210k
    }
168
169
3.64k
    return adler;
170
214k
}
171
172
2.28M
/* Checksum-only entry point: forwards to adler32_copy_impl() with dst = NULL and COPY = 0,
 * so nothing is written and only the updated checksum for src[0..len) is returned. */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
173
2.28M
    return adler32_copy_impl(adler, NULL, src, len, 0);
174
2.28M
}
175
176
0
/* Checksum-and-copy entry point: forwards to adler32_copy_impl() with COPY = 1,
 * so the bytes read from src are also stored to dst while the checksum is updated. */
Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
177
0
    return adler32_copy_impl(adler, dst, src, len, 1);
178
0
}
179
180
#endif