Coverage Report

Created: 2026-01-17 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/arch/x86/adler32_avx2.c
Line
Count
Source
1
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
2
 * Copyright (C) 1995-2011 Mark Adler
3
 * Copyright (C) 2022 Adam Stylinski
4
 * Authors:
5
 *   Brian Bockelman <bockelman@gmail.com>
6
 *   Adam Stylinski <kungfujesus06@gmail.com>
7
 * For conditions of distribution and use, see copyright notice in zlib.h
8
 */
9
10
#ifdef X86_AVX2
11
12
#include "zbuild.h"
13
#include <immintrin.h>
14
#include "adler32_p.h"
15
#include "adler32_avx2_p.h"
16
#include "x86_intrins.h"
17
18
extern uint32_t adler32_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
19
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
20
21
314M
Z_FORCEINLINE static uint32_t adler32_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
22
314M
    uint32_t adler0, adler1;
23
314M
    adler1 = (adler >> 16) & 0xffff;
24
314M
    adler0 = adler & 0xffff;
25
26
314M
rem_peel:
27
314M
    if (len < 16) {
28
313M
        return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
29
313M
    } else if (len < 32) {
30
712k
        if (COPY) {
31
19.3k
            return adler32_copy_sse42(adler, dst, src, len);
32
692k
        } else {
33
692k
            return adler32_ssse3(adler, src, len);
34
692k
        }
35
712k
    }
36
37
363k
    __m256i vs1, vs2, vs2_0;
38
39
363k
    const __m256i dot2v = _mm256_setr_epi8(64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47,
40
363k
                                           46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33);
41
363k
    const __m256i dot2v_0 = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
42
363k
                                             14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
43
363k
    const __m256i dot3v = _mm256_set1_epi16(1);
44
363k
    const __m256i zero = _mm256_setzero_si256();
45
46
1.19M
    while (len >= 32) {
47
836k
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
48
836k
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
49
836k
        __m256i vs1_0 = vs1;
50
836k
        __m256i vs3 = _mm256_setzero_si256();
51
836k
        vs2_0 = vs3;
52
53
836k
        size_t k = MIN(len, NMAX);
54
836k
        k -= k % 32;
55
836k
        len -= k;
56
57
46.1M
        while (k >= 64) {
58
45.3M
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
59
45.3M
            __m256i vbuf_0 = _mm256_loadu_si256((__m256i*)(src + 32));
60
45.3M
            src += 64;
61
45.3M
            k -= 64;
62
63
45.3M
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero);
64
45.3M
            __m256i vs1_sad2 = _mm256_sad_epu8(vbuf_0, zero);
65
66
45.3M
            if (COPY) {
67
24.4M
                _mm256_storeu_si256((__m256i*)dst, vbuf);
68
24.4M
                _mm256_storeu_si256((__m256i*)(dst + 32), vbuf_0);
69
24.4M
                dst += 64;
70
24.4M
            }
71
72
45.3M
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
73
45.3M
            vs3 = _mm256_add_epi32(vs3, vs1_0);
74
45.3M
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
75
45.3M
            __m256i v_short_sum2_0 = _mm256_maddubs_epi16(vbuf_0, dot2v_0); // sum 32 uint8s to 16 shorts
76
45.3M
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
77
45.3M
            __m256i vsum2_0 = _mm256_madd_epi16(v_short_sum2_0, dot3v); // sum 16 shorts to 8 uint32s
78
45.3M
            vs1 = _mm256_add_epi32(vs1_sad2, vs1);
79
45.3M
            vs2 = _mm256_add_epi32(vsum2, vs2);
80
45.3M
            vs2_0 = _mm256_add_epi32(vsum2_0, vs2_0);
81
45.3M
            vs1_0 = vs1;
82
45.3M
        }
83
84
836k
        vs2 = _mm256_add_epi32(vs2_0, vs2);
85
836k
        vs3 = _mm256_slli_epi32(vs3, 6);
86
836k
        vs2 = _mm256_add_epi32(vs3, vs2);
87
836k
        vs3 = _mm256_setzero_si256();
88
89
1.60M
        while (k >= 32) {
90
            /*
91
               vs1 = adler + sum(c[i])
92
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
93
            */
94
767k
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
95
767k
            src += 32;
96
767k
            k -= 32;
97
98
767k
            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's
99
100
767k
            if (COPY) {
101
294k
                _mm256_storeu_si256((__m256i*)dst, vbuf);
102
294k
                dst += 32;
103
294k
            }
104
105
767k
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
106
767k
            vs3 = _mm256_add_epi32(vs3, vs1_0);
107
767k
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v_0); // sum 32 uint8s to 16 shorts
108
767k
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
109
767k
            vs2 = _mm256_add_epi32(vsum2, vs2);
110
767k
            vs1_0 = vs1;
111
767k
        }
112
113
        /* Defer the multiplication with 32 to outside of the loop */
114
836k
        vs3 = _mm256_slli_epi32(vs3, 5);
115
836k
        vs2 = _mm256_add_epi32(vs2, vs3);
116
117
        /* The compiler is generating the following sequence for this integer modulus
118
         * when done the scalar way, in GPRs:
119
120
         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
121
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
122
123
         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
124
         ...
125
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
126
         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
127
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
128
         shr    $0x2f,%rsi // shift right by 47
129
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
130
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
131
         ...
132
         // repeats for each element with vpextract instructions
133
134
         This is tricky with AVX2 for a number of reasons:
135
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
136
             2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
137
                 back down to 32 bit precision later (there is in AVX512)
138
             3.) Full width integer multiplications aren't cheap
139
140
         We can, however, do a relatively cheap sequence for horizontal sums.
141
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
142
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
143
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
144
         performed on the maximum possible inputs before overflow
145
         */
146
147
148
         /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
149
          * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
150
          * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
151
          * what the compiler is doing to avoid integer divisions. */
152
836k
         adler0 = partial_hsum256(vs1) % BASE;
153
836k
         adler1 = hsum256(vs2) % BASE;
154
836k
    }
155
156
363k
    adler = adler0 | (adler1 << 16);
157
158
363k
    if (len) {
159
300k
        goto rem_peel;
160
300k
    }
161
162
62.1k
    return adler;
163
363k
}
164
165
2.76M
/* Checksum-only AVX2 entry point.  Instantiates the shared kernel with
 * COPY == 0, so every store through dst is compiled out and dst is NULL. */
Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    const int do_copy = 0;
    return adler32_copy_impl(adler, NULL, src, len, do_copy);
}
168
169
311M
/* Checksum-and-copy AVX2 entry point.  Instantiates the shared kernel with
 * COPY == 1: src is checksummed and simultaneously copied to dst. */
Z_INTERNAL uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    const int do_copy = 1;
    return adler32_copy_impl(adler, dst, src, len, do_copy);
}
172
173
#endif