Coverage Report

Created: 2025-07-18 07:00

/src/zlib-ng/arch/x86/adler32_avx2.c
Line  Count  Source
   1         /* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
   2          * Copyright (C) 1995-2011 Mark Adler
   3          * Copyright (C) 2022 Adam Stylinski
   4          * Authors:
   5          *   Brian Bockelman <bockelman@gmail.com>
   6          *   Adam Stylinski <kungfujesus06@gmail.com>
   7          * For conditions of distribution and use, see copyright notice in zlib.h
   8          */
   9
  10         #ifdef X86_AVX2
  11
  12         #include "zbuild.h"
  13         #include <immintrin.h>
  14         #include "adler32_p.h"
  15         #include "adler32_avx2_p.h"
  16         #include "x86_intrins.h"
  17
  18         extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
  19         extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
  20
  21   306M  static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
  22   306M      if (src == NULL) return 1L;
  23   306M      if (len == 0) return adler;
  24
  25   306M      uint32_t adler0, adler1;
  26   306M      adler1 = (adler >> 16) & 0xffff;
  27   306M      adler0 = adler & 0xffff;
  28
  29   307M  rem_peel:
  30   307M      if (len < 16) {
  31   306M          if (COPY) {
  32   305M              return adler32_copy_len_16(adler0, src, dst, len, adler1);
  33   305M          } else {
  34  1.28M              return adler32_len_16(adler0, src, len, adler1);
  35  1.28M          }
  36   306M      } else if (len < 32) {
  37   435k          if (COPY) {
  38  16.6k              return adler32_fold_copy_sse42(adler, dst, src, len);
  39   419k          } else {
  40   419k              return adler32_ssse3(adler, src, len);
  41   419k          }
  42   435k      }
  43
  44   497k      __m256i vs1, vs2;
  45
  46   497k      const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
  47   497k                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
  48   497k      const __m256i dot3v = _mm256_set1_epi16(1);
  49   497k      const __m256i zero = _mm256_setzero_si256();
  50
  51  1.44M      while (len >= 32) {
  52   947k          vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
  53   947k          vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
  54   947k          __m256i vs1_0 = vs1;
  55   947k          __m256i vs3 = _mm256_setzero_si256();
  56
  57   947k          size_t k = MIN(len, NMAX);
  58   947k          k -= k % 32;
  59   947k          len -= k;
  60
  61  87.9M          while (k >= 32) {
  62                     /*
  63                        vs1 = adler + sum(c[i])
  64                        vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
  65                     */
  66  87.0M              __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
  67  87.0M              src += 32;
  68  87.0M              k -= 32;
  69
  70  87.0M              __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 4 x 64-bit partial sums
  71
  72  87.0M              if (COPY) {
  73  47.0M                  _mm256_storeu_si256((__m256i*)dst, vbuf);
  74  47.0M                  dst += 32;
  75  47.0M              }
  76
  77  87.0M              vs1 = _mm256_add_epi32(vs1, vs1_sad);
  78  87.0M              vs3 = _mm256_add_epi32(vs3, vs1_0);
  79  87.0M              __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
  80  87.0M              __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
  81  87.0M              vs2 = _mm256_add_epi32(vsum2, vs2);
  82  87.0M              vs1_0 = vs1;
  83  87.0M          }
  84
  85                 /* Defer the multiplication with 32 to outside of the loop */
  86   947k          vs3 = _mm256_slli_epi32(vs3, 5);
  87   947k          vs2 = _mm256_add_epi32(vs2, vs3);
  88
  89                 /* The compiler is generating the following sequence for this integer modulus
  90                  * when done the scalar way, in GPRs:
  91
  92                  adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
  93                          (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
  94
  95                  mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
  96                  ...
  97                  vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
  98                  mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
  99                  imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
 100                  shr    $0x2f,%rsi // shift right by 47
 101                  imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
 102                  sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
 103                  ...
 104                  // repeats for each element with vpextract instructions
 105
 106                  This is tricky with AVX2 for a number of reasons:
 107                      1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
 108                      2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
 109                          back down to 32 bit precision later (there is in AVX512)
 110                      3.) Full width integer multiplications aren't cheap
 111
 112                  We can, however, do a relatively cheap sequence for horizontal sums.
 113                  Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
 114                  previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
 115                  that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
 116                  performed on the maximum possible inputs before overflow
 117                  */
 118
 119
 120                  /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
 121                   * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
 122                   * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
 123                   * what the compiler is doing to avoid integer divisions. */
 124   947k           adler0 = partial_hsum256(vs1) % BASE;
 125   947k           adler1 = hsum256(vs2) % BASE;
 126   947k      }
 127
 128   497k      adler = adler0 | (adler1 << 16);
 129
 130   497k      if (len) {
 131   426k          goto rem_peel;
 132   426k      }
 133
 134  70.7k      return adler;
 135   497k  }
 136
 137  1.71M  Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
 138  1.71M      return adler32_fold_copy_impl(adler, NULL, src, len, 0);
 139  1.71M  }
 140
 141   305M  Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
 142   305M      return adler32_fold_copy_impl(adler, dst, src, len, 1);
 143   305M  }
 144
 145         #endif
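
The comment at source lines 62-65 states the per-block recurrence the inner loop maintains: vs1 accumulates the plain byte sum, vs2 accumulates the weighted sum, and the "32 * s1" term is deferred into vs3. As a reference point, below is a minimal scalar sketch of what one 32-byte iteration contributes; it is illustrative only (the real kernel keeps the sums spread across __m256i lanes, and adler32_block32_sketch is a hypothetical name, not part of zlib-ng).

    #include <stdint.h>

    /* One 32-byte block of the Adler-32 recurrence: a scalar model of the SIMD inner loop. */
    static void adler32_block32_sketch(uint32_t *s1, uint32_t *s2, uint32_t *s3,
                                       const uint8_t *c) {
        uint32_t byte_sum = 0, weighted_sum = 0;

        for (int i = 0; i < 32; i++) {
            byte_sum     += c[i];                      /* what _mm256_sad_epu8 feeds into vs1         */
            weighted_sum += (uint32_t)(32 - i) * c[i]; /* what maddubs/madd with dot2v/dot3v feed vs2 */
        }

        *s3 += *s1;          /* vs3 += vs1_0: the deferred "32 * s1" term */
        *s1 += byte_sum;     /* vs1 += per-block byte sum                 */
        *s2 += weighted_sum; /* vs2 += weighted sum of the 32 bytes       */
    }

After at most NMAX bytes the deferred term is folded in once as vs2 += vs3 << 5 (source lines 86-87), which is why the multiplication by 32 never appears inside the inner loop.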
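
partial_hsum256() and hsum256() at source lines 124-125 come from adler32_avx2_p.h, which is outside this report. A plausible sketch of their shape is below, assuming hsum256 sums all eight 32-bit lanes and partial_hsum256 can rely on the odd lanes of vs1 being zero (the low halves of the four 64-bit _mm256_sad_epu8 results sit in lanes 0, 2, 4 and 6). The _sketch names are illustrative, not the library's definitions.

    #include <immintrin.h>
    #include <stdint.h>

    /* Sum all eight 32-bit lanes of a __m256i. */
    static inline uint32_t hsum256_sketch(__m256i x) {
        __m128i lo = _mm256_castsi256_si128(x);
        __m128i hi = _mm256_extracti128_si256(x, 1);
        __m128i s  = _mm_add_epi32(lo, hi);                /* 8 lanes -> 4 lanes */
        s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E));  /* 4 lanes -> 2 lanes */
        s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0xB1));  /* 2 lanes -> 1 lane  */
        return (uint32_t)_mm_cvtsi128_si32(s);
    }

    /* vs1 only carries data in its even 32-bit lanes, so the full reduction is
     * also correct for it; a dedicated partial version can merely skip work. */
    static inline uint32_t partial_hsum256_sketch(__m256i x) {
        return hsum256_sketch(x);
    }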
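
The assembly walk-through at source lines 95-104 is the usual division-by-invariant-integer trick: multiply by a scaled reciprocal, shift, then multiply back and subtract. A worked C equivalent is below; mod_base_sketch is a hypothetical name, and 0x80078071 is ceil(2^47 / 65521), so the shifted product recovers the exact quotient for any 32-bit input.

    #include <stdint.h>

    /* x % 65521 without a hardware divide, mirroring the compiler's sequence. */
    static inline uint32_t mod_base_sketch(uint32_t x) {
        uint64_t q = ((uint64_t)x * 0x80078071ULL) >> 47; /* q == x / 65521 for all 32-bit x */
        return x - (uint32_t)q * 65521U;                  /* x - q*65521 == x % 65521        */
    }

This is also why the code only needs the horizontal sums in a GPR before taking % BASE at source lines 124-125: the compiler lowers each scalar modulus to this multiply-shift-subtract sequence on its own.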