/src/zlib-ng/arch/x86/adler32_avx2.c
Line | Count | Source |
1 | | /* adler32_avx2.c -- compute the Adler-32 checksum of a data stream |
2 | | * Copyright (C) 1995-2011 Mark Adler |
3 | | * Copyright (C) 2022 Adam Stylinski |
4 | | * Authors: |
5 | | * Brian Bockelman <bockelman@gmail.com> |
6 | | * Adam Stylinski <kungfujesus06@gmail.com> |
7 | | * For conditions of distribution and use, see copyright notice in zlib.h |
8 | | */ |
9 | | |
10 | | #ifdef X86_AVX2 |
11 | | |
12 | | #include "zbuild.h" |
13 | | #include <immintrin.h> |
14 | | #include "adler32_p.h" |
15 | | #include "adler32_avx2_p.h" |
16 | | #include "x86_intrins.h" |
17 | | |
18 | | extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); |
19 | | extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); |
20 | | |
21 | 1.79M | static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) { |
22 | 1.79M | if (src == NULL) return 1L; |
23 | 1.79M | if (len == 0) return adler; |
24 | | |
25 | 1.79M | uint32_t adler0, adler1; |
26 | 1.79M | adler1 = (adler >> 16) & 0xffff; |
27 | 1.79M | adler0 = adler & 0xffff; |
28 | | |
29 | 2.11M | rem_peel: |
30 | 2.11M | if (len < 16) { |
31 | 1.17M | if (COPY) { |
32 | 0 | return adler32_copy_len_16(adler0, src, dst, len, adler1); |
33 | 1.17M | } else { |
34 | 1.17M | return adler32_len_16(adler0, src, len, adler1); |
35 | 1.17M | } |
36 | 1.17M | } else if (len < 32) { |
37 | 592k | if (COPY) { |
38 | 0 | return adler32_fold_copy_sse42(adler, dst, src, len); |
39 | 592k | } else { |
40 | 592k | return adler32_ssse3(adler, src, len); |
41 | 592k | } |
42 | 592k | } |
43 | | |
44 | 353k | __m256i vs1, vs2, vs2_0; |
45 | | |
46 | 353k | const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, |
47 | 353k | 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33); |
48 | 353k | const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, |
49 | 353k | 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); |
50 | 353k | const __m256i dot3v = _mm256_set1_epi16(1); |
51 | 353k | const __m256i zero = _mm256_setzero_si256(); |
52 | | |
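 | | /* Illustrative sketch (editorial addition, not part of the original source): |
 | |  * the unrolled loop below computes, per 64-byte block, the same update as this |
 | |  * scalar code, where adler0/adler1 are the s1/s2 halves of the checksum: |
 | |  * |
 | |  *     for (size_t i = 0; i < 64; i++) { |
 | |  *         adler0 += src[i];   // s1: running byte sum |
 | |  *         adler1 += adler0;   // s2: running sum of s1 values |
 | |  *     } |
 | |  * |
 | |  * Expanding the recurrence gives s2 += 64 * s1_old + sum((64 - i) * src[i]), |
 | |  * which is why dot2v holds the weights 64..33 (first 32 bytes), dot2v_0 holds |
 | |  * 32..1 (second 32 bytes), and the accumulated s1_old values in vs3 are shifted |
 | |  * left by 6 (i.e. multiplied by 64) after the loop. The 32-byte loop further |
 | |  * down is analogous with a block size of 32. */ |
 | |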
53 | 713k | while (len >= 32) { |
54 | 360k | vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); |
55 | 360k | vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); |
56 | 360k | __m256i vs1_0 = vs1; |
57 | 360k | __m256i vs3 = _mm256_setzero_si256(); |
58 | 360k | vs2_0 = vs3; |
59 | | |
60 | 360k | size_t k = MIN(len, NMAX); |
61 | 360k | k -= k % 32; |
62 | 360k | len -= k; |
63 | | |
64 | 1.24M | while (k >= 64) { |
65 | 881k | __m256i vbuf = _mm256_loadu_si256((__m256i*)src); |
66 | 881k | __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32)); |
67 | 881k | src += 64; |
68 | 881k | k -= 64; |
69 | | |
70 | 881k | __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); |
71 | 881k | __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero); |
72 | | |
73 | 881k | if (COPY) { |
74 | 0 | _mm256_storeu_si256((__m256i*)dst, vbuf); |
75 | 0 | _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0); |
76 | 0 | dst += 64; |
77 | 0 | } |
78 | | |
79 | 881k | vs1 = _mm256_add_epi32(vs1, vs1_sad); |
80 | 881k | vs3 = _mm256_add_epi32(vs3, vs1_0); |
81 | 881k | __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // multiply 32 uint8s by their weights, pairwise-adding to 16 int16s |
82 | 881k | __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // multiply 32 uint8s by their weights, pairwise-adding to 16 int16s |
83 | 881k | __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // pairwise-add the 16 int16s into 8 int32s |
84 | 881k | __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // pairwise-add the 16 int16s into 8 int32s |
85 | 881k | vs1 = _mm256_add_epi32(vs1_sad2, vs1); |
86 | 881k | vs2 = _mm256_add_epi32(vsum2, vs2); |
87 | 881k | vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0); |
88 | 881k | vs1_0 = vs1; |
89 | 881k | } |
90 | | |
91 | 360k | vs2 = _mm256_add_epi32(vs2_0, vs2); |
92 | 360k | vs3 = _mm256_slli_epi32(vs3, 6); |
93 | 360k | vs2 = _mm256_add_epi32(vs3, vs2); |
94 | 360k | vs3 = _mm256_setzero_si256(); |
95 | | |
96 | 669k | while (k >= 32) { |
97 | | /* |
98 | | vs1 = adler + sum( c[i] ) |
99 | | vs2 = sum2 + 32 * vs1 + sum( (32 - i + 1) * c[i] ), i = 1..32 |
100 | | */ |
101 | 308k | __m256i vbuf = _mm256_loadu_si256((__m256i*)src); |
102 | 308k | src += 32; |
103 | 308k | k -= 32; |
104 | | |
105 | 308k | __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff against zero, resulting in 4 x 64-bit partial byte sums |
106 | | |
107 | 308k | if (COPY) { |
108 | 0 | _mm256_storeu_si256((__m256i*)dst, vbuf); |
109 | 0 | dst += 32; |
110 | 0 | } |
111 | | |
112 | 308k | vs1 = _mm256_add_epi32(vs1, vs1_sad); |
113 | 308k | vs3 = _mm256_add_epi32(vs3, vs1_0); |
114 | 308k | __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // multiply 32 uint8s by their weights, pairwise-adding to 16 int16s |
115 | 308k | __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // pairwise-add the 16 int16s into 8 int32s |
116 | 308k | vs2 = _mm256_add_epi32(vsum2, vs2); |
117 | 308k | vs1_0 = vs1; |
118 | 308k | } |
119 | | |
120 | | /* Defer the multiplication by 32 to outside of the loop */ |
121 | 360k | vs3 = _mm256_slli_epi32(vs3, 5); |
122 | 360k | vs2 = _mm256_add_epi32(vs2, vs3); |
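 | |
 | | /* Editorial note (hedged): the deferral is valid because |
 | |  *     sum_j (32 * s1_j) == 32 * sum_j (s1_j) |
 | |  * so vs3 only accumulates the per-iteration s1 values (vs1_0), and a single |
 | |  * shift left by 5 here (or by 6 after the 64-byte loop above) applies the |
 | |  * factor of 32 (or 64) once instead of multiplying every iteration. */ |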
123 | | |
124 | | /* The compiler is generating the following sequence for this integer modulus |
125 | | * when done the scalar way, in GPRs: |
126 | | |
127 | | adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + |
128 | | (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); |
129 | | |
130 | | mov $0x80078071,%edi // move magic constant into 32 bit register %edi |
131 | | ... |
132 | | vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi |
133 | | mov %rsi,%rax // keep a copy of the original value in %rax |
134 | | imul %rdi,%rsi // do a signed multiplication with magic constant and vector element |
135 | | shr $0x2f,%rsi // shift right by 47 |
136 | | imul $0xfff1,%esi,%esi // multiply the quotient by BASE (0xfff1), truncated to 32 bits |
137 | | sub %esi,%eax // subtract quotient * BASE from the original value, leaving the remainder |
138 | | ... |
139 | | // repeats for each element with vpextract instructions |
140 | | |
141 | | This is tricky with AVX2 for a number of reasons: |
142 | | 1.) There's no 64 bit vector multiplication instruction, but there is a sequence to get there |
143 | | 2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate |
144 | | back down to 32 bit precision later (there is in AVX512) |
145 | | 3.) Full width integer multiplications aren't cheap |
146 | | |
147 | | We can, however, do a relatively cheap sequence for horizontal sums. |
148 | | Then we simply do the integer modulus on the resulting scalar value in a GPR. It was |
149 | | previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but |
150 | | that is not the case: NMAX is defined as the maximum number of scalar sums that can be |
151 | | performed on maximum-value inputs before a 32 bit accumulator would overflow. |
152 | | */ |
153 | | |
154 | | |
155 | | /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy |
156 | | * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant). |
157 | | * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly |
158 | | * what the compiler is doing to avoid integer divisions. */ |
159 | 360k | adler0 = partial_hsum256(vs1) % BASE; |
160 | 360k | adler1 = hsum256(vs2) % BASE; |
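 | |
 | | /* Editorial sketch (hedged): partial_hsum256() and hsum256() come from |
 | |  * adler32_avx2_p.h; a plain horizontal add of eight 32-bit lanes could look |
 | |  * roughly like this (illustrative name, not the project's implementation): |
 | |  * |
 | |  *     static inline uint32_t hsum256_sketch(__m256i v) { |
 | |  *         __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v), |
 | |  *                                     _mm256_extracti128_si256(v, 1)); |
 | |  *         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))); |
 | |  *         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1))); |
 | |  *         return (uint32_t)_mm_cvtsi128_si32(sum); |
 | |  *     } |
 | |  * |
 | |  * The "partial" variant can skip lanes known to be zero, since _mm256_sad_epu8 |
 | |  * populates only the low 32 bits of each 64-bit group of vs1. */ |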
161 | 360k | } |
162 | | |
163 | 353k | adler = adler0 | (adler1 << 16); |
164 | | |
165 | 353k | if (len) { |
166 | 324k | goto rem_peel; |
167 | 324k | } |
168 | | |
169 | 28.7k | return adler; |
170 | 353k | } |
171 | | |
172 | 1.79M | Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { |
173 | 1.79M | return adler32_fold_copy_impl(adler, NULL, src, len, 0); |
174 | 1.79M | } |
175 | | |
176 | 0 | Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { |
177 | 0 | return adler32_fold_copy_impl(adler, dst, src, len, 1); |
178 | 0 | } |
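 | |
 | | /* Usage sketch (editorial, hedged): these entry points are normally selected via |
 | |  * zlib-ng's runtime CPU dispatch, but called directly they behave like the scalar |
 | |  * Adler-32 routines, e.g.: |
 | |  * |
 | |  *     uint32_t chk  = adler32_avx2(1, buf, len);                 // 1 is the Adler-32 seed |
 | |  *     uint32_t chk2 = adler32_fold_copy_avx2(1, dst, buf, len);  // checksum while copying |
 | |  */ |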
179 | | |
180 | | #endif |