Coverage Report

Created: 2026-02-26 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/adler32_avx2.c
Line
Count
Source
1
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
2
 * Copyright (C) 1995-2011 Mark Adler
3
 * Copyright (C) 2022 Adam Stylinski
4
 * Authors:
5
 *   Brian Bockelman <bockelman@gmail.com>
6
 *   Adam Stylinski <kungfujesus06@gmail.com>
7
 * For conditions of distribution and use, see copyright notice in zlib.h
8
 */
9
10
#ifdef X86_AVX2
11
12
#include "zbuild.h"
13
#include <immintrin.h>
14
#include "adler32_p.h"
15
#include "adler32_avx2_p.h"
16
#include "x86_intrins.h"
17
18
extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
19
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
20
21
2.56M
Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
22
2.56M
    uint32_t adler0, adler1;
23
2.56M
    adler1 = (adler >> 16) & 0xffff;
24
2.56M
    adler0 = adler & 0xffff;
25
26
2.78M
rem_peel:
27
2.78M
    if (len < 16) {
28
2.10M
        return adler32_copy_tail(adler0, dst, src, len, adler1, 1, 15, COPY);
29
2.10M
    } else if (len < 32) {
30
434k
        if (COPY) {
31
0
            return adler32_copy_sse42(adler, dst, src, len);
32
434k
        } else {
33
434k
            return adler32_ssse3(adler, src, len);
34
434k
        }
35
434k
    }
36
37
248k
    __m256i vs1, vs2, vs2_0;
38
39
248k
    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
40
248k
                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
41
248k
    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
42
248k
                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
43
248k
    const __m256i dot3v = _mm256_set1_epi16(1);
44
248k
    const __m256i zero = _mm256_setzero_si256();
45
46
505k
    while (len >= 32) {
47
256k
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
48
256k
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
49
256k
        __m256i vs1_0 = vs1;
50
256k
        __m256i vs3 = _mm256_setzero_si256();
51
256k
        vs2_0 = vs3;
52
53
256k
        size_t k = ALIGN_DOWN(MIN(len, NMAX), 32);
54
256k
        len -= k;
55
56
1.30M
        while (k >= 64) {
57
1.05M
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
58
1.05M
            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
59
1.05M
            src += 64;
60
1.05M
            k -= 64;
61
62
1.05M
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
63
1.05M
            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
64
65
1.05M
            if (COPY) {
66
0
                _mm256_storeu_si256((__m256i*)dst, vbuf);
67
0
                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
68
0
                dst += 64;
69
0
            }
70
71
1.05M
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
72
1.05M
            vs3 = _mm256_add_epi32(vs3, vs1_0);
73
1.05M
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
74
1.05M
            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
75
1.05M
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
76
1.05M
            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
77
1.05M
            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
78
1.05M
            vs2 = _mm256_add_epi32(vsum2, vs2);
79
1.05M
            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
80
1.05M
            vs1_0 = vs1;
81
1.05M
        }
82
83
256k
        vs2 = _mm256_add_epi32(vs2_0, vs2);
84
256k
        vs3 = _mm256_slli_epi32(vs3, 6);
85
256k
        vs2 = _mm256_add_epi32(vs3, vs2);
86
256k
        vs3 = _mm256_setzero_si256();
87
88
497k
        while (k >= 32) {
89
            /*
90
               vs1 = adler + sum(c[i])
91
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
92
            */
93
240k
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
94
240k
            src += 32;
95
240k
            k -= 32;
96
97
240k
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
98
99
240k
            if (COPY) {
100
0
                _mm256_storeu_si256((__m256i*)dst, vbuf);
101
0
                dst += 32;
102
0
            }
103
104
240k
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
105
240k
            vs3 = _mm256_add_epi32(vs3, vs1_0);
106
240k
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
107
240k
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
108
240k
            vs2 = _mm256_add_epi32(vsum2, vs2);
109
240k
            vs1_0 = vs1;
110
240k
        }
111
112
        /* Defer the multiplication with 32 to outside of the loop */
113
256k
        vs3 = _mm256_slli_epi32(vs3, 5);
114
256k
        vs2 = _mm256_add_epi32(vs2, vs3);
115
116
        /* The compiler is generating the following sequence for this integer modulus
117
         * when done the scalar way, in GPRs:
118
119
         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
120
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
121
122
         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
123
         ...
124
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
125
         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
126
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
127
         shr    $0x2f,%rsi // shift right by 47
128
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
129
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
130
         ...
131
         // repeats for each element with vpextract instructions
132
133
         This is tricky with AVX2 for a number of reasons:
134
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
135
             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
136
                 back down to 32 bit precision later (there is in AVX512)
137
             3.) Full width integer multiplications aren't cheap
138
139
         We can, however, do a relatively cheap sequence for horizontal sums.
140
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
141
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
142
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
143
         performed on the maximum possible inputs before overflow
144
         */
145
146
147
         /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
148
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
149
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
150
          * what the compiler is doing to avoid integer divisions. */
151
256k
         adler0 = partial_hsum256(vs1) % BASE;
152
256k
         adler1 = hsum256(vs2) % BASE;
153
256k
    }
154
155
248k
    adler = adler0 | (adler1 << 16);
156
157
248k
    if (len) {
158
221k
        goto rem_peel;
159
221k
    }
160
161
27.5k
    return adler;
162
248k
}
163
164
2.56M
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
165
2.56M
    return adler32_copy_impl(adler, NULL, src, len, 0);
166
2.56M
}
167
168
0
Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
169
0
    return adler32_copy_impl(adler, dst, src, len, 1);
170
0
}
171
172
#endif