Coverage Report

Created: 2026-01-17 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/adler32_avx2.c
Line
Count
Source
1
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
2
 * Copyright (C) 1995-2011 Mark Adler
3
 * Copyright (C) 2022 Adam Stylinski
4
 * Authors:
5
 *   Brian Bockelman <bockelman@gmail.com>
6
 *   Adam Stylinski <kungfujesus06@gmail.com>
7
 * For conditions of distribution and use, see copyright notice in zlib.h
8
 */
9
10
#ifdef X86_AVX2
11
12
#include "zbuild.h"
13
#include <immintrin.h>
14
#include "adler32_p.h"
15
#include "adler32_avx2_p.h"
16
#include "x86_intrins.h"
17
18
extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
19
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
20
21
314M
Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
22
314M
    uint32_t adler0, adler1;
23
314M
    adler1 = (adler >> 16) & 0xffff;
24
314M
    adler0 = adler & 0xffff;
25
26
314M
rem_peel:
27
314M
    if (len < 16) {
28
313M
        return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
29
313M
    } else if (len < 32) {
30
712k
        if (COPY) {
31
19.3k
            return adler32_copy_sse42(adler, dst, src, len);
32
692k
        } else {
33
692k
            return adler32_ssse3(adler, src, len);
34
692k
        }
35
712k
    }
36
37
363k
    __m256i vs1, vs2, vs2_0;
38
39
363k
    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
40
363k
                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
41
363k
    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
42
363k
                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
43
363k
    const __m256i dot3v = _mm256_set1_epi16(1);
44
363k
    const __m256i zero = _mm256_setzero_si256();
45
46
1.19M
    while (len >= 32) {
47
836k
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
48
836k
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
49
836k
        __m256i vs1_0 = vs1;
50
836k
        __m256i vs3 = _mm256_setzero_si256();
51
836k
        vs2_0 = vs3;
52
53
836k
        size_t k = MIN(len, NMAX);
54
836k
        k -= k % 32;
55
836k
        len -= k;
56
57
46.1M
        while (k >= 64) {
58
45.3M
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
59
45.3M
            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
60
45.3M
            src += 64;
61
45.3M
            k -= 64;
62
63
45.3M
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
64
45.3M
            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
65
66
45.3M
            if (COPY) {
67
24.4M
                _mm256_storeu_si256((__m256i*)dst, vbuf);
68
24.4M
                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
69
24.4M
                dst += 64;
70
24.4M
            }
71
72
45.3M
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
73
45.3M
            vs3 = _mm256_add_epi32(vs3, vs1_0);
74
45.3M
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
75
45.3M
            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
76
45.3M
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
77
45.3M
            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
78
45.3M
            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
79
45.3M
            vs2 = _mm256_add_epi32(vsum2, vs2);
80
45.3M
            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
81
45.3M
            vs1_0 = vs1;
82
45.3M
        }
83
84
836k
        vs2 = _mm256_add_epi32(vs2_0, vs2);
85
836k
        vs3 = _mm256_slli_epi32(vs3, 6);
86
836k
        vs2 = _mm256_add_epi32(vs3, vs2);
87
836k
        vs3 = _mm256_setzero_si256();
88
89
1.60M
        while (k >= 32) {
90
            /*
91
               vs1 = adler + sum(c[i])
92
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
93
            */
94
767k
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
95
767k
            src += 32;
96
767k
            k -= 32;
97
98
767k
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
99
100
767k
            if (COPY) {
101
294k
                _mm256_storeu_si256((__m256i*)dst, vbuf);
102
294k
                dst += 32;
103
294k
            }
104
105
767k
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
106
767k
            vs3 = _mm256_add_epi32(vs3, vs1_0);
107
767k
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
108
767k
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
109
767k
            vs2 = _mm256_add_epi32(vsum2, vs2);
110
767k
            vs1_0 = vs1;
111
767k
        }
112
113
        /* Defer the multiplication with 32 to outside of the loop */
114
836k
        vs3 = _mm256_slli_epi32(vs3, 5);
115
836k
        vs2 = _mm256_add_epi32(vs2, vs3);
116
117
        /* The compiler is generating the following sequence for this integer modulus
118
         * when done the scalar way, in GPRs:
119
120
         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
121
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
122
123
         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
124
         ...
125
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
126
         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
127
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
128
         shr    $0x2f,%rsi // shift right by 47
129
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
130
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
131
         ...
132
         // repeats for each element with vpextract instructions
133
134
         This is tricky with AVX2 for a number of reasons:
135
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
136
             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
137
                 back down to 32 bit precision later (there is in AVX512)
138
             3.) Full width integer multiplications aren't cheap
139
140
         We can, however, do a relatively cheap sequence for horizontal sums.
141
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
142
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
143
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
144
         performed on the maximum possible inputs before overflow
145
         */
146
147
148
         /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
149
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
150
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
151
          * what the compiler is doing to avoid integer divisions. */
152
836k
         adler0 = partial_hsum256(vs1) % BASE;
153
836k
         adler1 = hsum256(vs2) % BASE;
154
836k
    }
155
156
363k
    adler = adler0 | (adler1 << 16);
157
158
363k
    if (len) {
159
300k
        goto rem_peel;
160
300k
    }
161
162
62.1k
    return adler;
163
363k
}
164
165
2.76M
/* Checksum-only AVX2 entry point.  Instantiates the shared kernel with
 * COPY == 0, so every store through dst is compiled out and dst is NULL. */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    const int do_copy = 0;
    return adler32_copy_impl(adler, NULL, src, len, do_copy);
}
168
169
311M
/* Checksum-and-copy AVX2 entry point.  Instantiates the shared kernel with
 * COPY == 1: src is checksummed and simultaneously copied to dst. */
Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int do_copy = 1;
    return adler32_copy_impl(adler, dst, src, len, do_copy);
}
172
173
#endif