Coverage Report

Created: 2026-01-17 06:26

/src/zlib-ng/arch/x86/adler32_ssse3.c
  Line |  Count | Source
-------+--------+------------------------------------------------------------------
     1 |        | /* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
     2 |        |  * Copyright (C) 1995-2011 Mark Adler
     3 |        |  * Authors:
     4 |        |  *   Adam Stylinski <kungfujesus06@gmail.com>
     5 |        |  *   Brian Bockelman <bockelman@gmail.com>
     6 |        |  * For conditions of distribution and use, see copyright notice in zlib.h
     7 |        |  */
     8 |        |
     9 |        | #include "zbuild.h"
    10 |        | #include "adler32_p.h"
    11 |        | #include "adler32_ssse3_p.h"
    12 |        |
    13 |        | #ifdef X86_SSSE3
    14 |        |
    15 |        | #include <immintrin.h>
    16 |        |
    17 |   669k | Z_FORCEINLINE static uint32_t adler32_impl(uint32_t adler, const uint8_t *buf, size_t len) {
    18 |   669k |     uint32_t sum2;
    19 |        |
    20 |        |     /* split Adler-32 into component sums */
    21 |   669k |     sum2 = (adler >> 16) & 0xffff;
    22 |   669k |     adler &= 0xffff;
    23 |        |
    24 |        |     /* in case user likes doing a byte at a time, keep it fast */
    25 |   669k |     if (UNLIKELY(len == 1))
    26 |      0 |         return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
    27 |        |
    28 |        |     /* in case short lengths are provided, keep it somewhat fast */
    29 |   669k |     if (UNLIKELY(len < 16))
    30 |      0 |         return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
    31 |        |
    32 |   669k |     const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    33 |   669k |     const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    34 |   669k |     const __m128i dot3v = _mm_set1_epi16(1);
    35 |   669k |     const __m128i zero = _mm_setzero_si128();
    36 |        |
    37 |   669k |     __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
    38 |   669k |             vbuf_0, v_sad_sum2, vsum2, vsum2_0;
    39 |        |
    40 |        |     /* If our buffer is unaligned (likely), make the determination whether
    41 |        |      * or not there's enough of a buffer to consume to make the scalar, aligning
    42 |        |      * additions worthwhile or if it's worth it to just eat the cost of an unaligned
    43 |        |      * load. This is a pretty simple test, just test if 16 - the remainder + len is
    44 |        |      * < 16 */
    45 |   669k |     size_t max_iters = NMAX;
    46 |   669k |     size_t rem = (uintptr_t)buf & 15;
    47 |   669k |     size_t align_offset = 16 - rem;
    48 |   669k |     size_t k = 0;
    49 |   669k |     if (rem) {
    50 |   395k |         if (len < 16 + align_offset) {
    51 |        |             /* Let's eat the cost of this one unaligned load so that
    52 |        |              * we don't completely skip over the vectorization. Doing
    53 |        |              * 16 bytes at a time unaligned is better than 16 + <= 15
    54 |        |              * sums */
    55 |  10.1k |             vbuf = _mm_loadu_si128((__m128i*)buf);
    56 |  10.1k |             len -= 16;
    57 |  10.1k |             buf += 16;
    58 |  10.1k |             vs1 = _mm_cvtsi32_si128(adler);
    59 |  10.1k |             vs2 = _mm_cvtsi32_si128(sum2);
    60 |  10.1k |             vs3 = _mm_setzero_si128();
    61 |  10.1k |             vs1_0 = vs1;
    62 |  10.1k |             goto unaligned_jmp;
    63 |  10.1k |         }
    64 |        |
    65 |  3.45M |         for (size_t i = 0; i < align_offset; ++i) {
    66 |  3.06M |             adler += *(buf++);
    67 |  3.06M |             sum2 += adler;
    68 |  3.06M |         }
    69 |        |
    70 |        |         /* lop off the max number of sums based on the scalar sums done
    71 |        |          * above */
    72 |   385k |         len -= align_offset;
    73 |   385k |         max_iters -= align_offset;
    74 |   385k |     }
    75 |        |
    76 |        |
    77 |  1.32M |     while (len >= 16) {
    78 |   659k |         vs1 = _mm_cvtsi32_si128(adler);
    79 |   659k |         vs2 = _mm_cvtsi32_si128(sum2);
    80 |   659k |         vs3 = _mm_setzero_si128();
    81 |   659k |         vs2_0 = _mm_setzero_si128();
    82 |   659k |         vs1_0 = vs1;
    83 |        |
    84 |   659k |         k = (len < max_iters ? len : max_iters);
    85 |   659k |         k -= k % 16;
    86 |   659k |         len -= k;
    87 |        |
    88 |   659k |         while (k >= 32) {
    89 |        |             /*
    90 |        |                vs1 = adler + sum(c[i])
    91 |        |                vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
    92 |        |             */
    93 |      0 |             vbuf = _mm_load_si128((__m128i*)buf);
    94 |      0 |             vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
    95 |      0 |             buf += 32;
    96 |      0 |             k -= 32;
    97 |        |
    98 |      0 |             v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
    99 |      0 |             v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
   100 |      0 |             vs1 = _mm_add_epi32(v_sad_sum1, vs1);
   101 |      0 |             vs3 = _mm_add_epi32(vs1_0, vs3);
   102 |        |
   103 |      0 |             vs1 = _mm_add_epi32(v_sad_sum2, vs1);
   104 |      0 |             v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
   105 |      0 |             vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
   106 |      0 |             v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
   107 |      0 |             vs2 = _mm_add_epi32(vsum2, vs2);
   108 |      0 |             vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
   109 |      0 |             vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
   110 |      0 |             vs1_0 = vs1;
   111 |      0 |         }
   112 |        |
   113 |   659k |         vs2 = _mm_add_epi32(vs2_0, vs2);
   114 |   659k |         vs3 = _mm_slli_epi32(vs3, 5);
   115 |   659k |         vs2 = _mm_add_epi32(vs3, vs2);
   116 |   659k |         vs3 = _mm_setzero_si128();
   117 |        |
   118 |  1.32M |         while (k >= 16) {
   119 |        |             /*
   120 |        |                vs1 = adler + sum(c[i])
   121 |        |                vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
   122 |        |             */
   123 |   659k |             vbuf = _mm_load_si128((__m128i*)buf);
   124 |   659k |             buf += 16;
   125 |   659k |             k -= 16;
   126 |        |
   127 |   669k | unaligned_jmp:
   128 |   669k |             v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
   129 |   669k |             vs1 = _mm_add_epi32(v_sad_sum1, vs1);
   130 |   669k |             vs3 = _mm_add_epi32(vs1_0, vs3);
   131 |   669k |             v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
   132 |   669k |             vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
   133 |   669k |             vs2 = _mm_add_epi32(vsum2, vs2);
   134 |   669k |             vs1_0 = vs1;
   135 |   669k |         }
   136 |        |
   137 |   669k |         vs3 = _mm_slli_epi32(vs3, 4);
   138 |   669k |         vs2 = _mm_add_epi32(vs2, vs3);
   139 |        |
   140 |        |         /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
   141 |        |          * a partial reduction sum implicitly and only summing to integers in vector positions
   142 |        |          * 0 and 2. This saves us some contention on the shuffle port(s) */
   143 |   669k |         adler = partial_hsum(vs1) % BASE;
   144 |   669k |         sum2 = hsum(vs2) % BASE;
   145 |   669k |         max_iters = NMAX;
   146 |   669k |     }
   147 |        |
   148 |        |     /* Process tail (len < 16).  */
   149 |   669k |     return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
   150 |   659k | }
   151 |        |
   152 |   669k | Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
   153 |   669k |     return adler32_impl(adler, buf, len);
   154 |   669k | }
   155 |        |
   156 |        | /* SSSE3 unaligned stores have a huge penalty, so we use memcpy. */
   157 |      0 | Z_INTERNAL uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
   158 |      0 |     adler = adler32_impl(adler, src, len);
   159 |      0 |     memcpy(dst, src, len);
   160 |      0 |     return adler;
   161 |      0 | }
   162 |        | #endif
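
The component sums split out at source lines 21-22 and the recurrence quoted in the comments at source lines 89-92 and 119-122 both follow from the scalar Adler-32 definition. As a reading aid for the vector code, here is a minimal scalar sketch in C (reductions modulo BASE deferred, just as the SIMD path defers them until up to NMAX bytes have been accumulated; both macros come from adler32_p.h, which is not shown here). The unrolled 16-byte form written out below it is why dot2v_0 carries the descending weights 16..1, why psadbw against zero supplies the plain byte sum, and why vs3 collects the old vs1 values that the shift at source line 137 multiplies by 16:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference: "adler" (s1) sums the bytes, "sum2" (s2) sums the
     * running values of s1 after every byte. */
    static void adler32_block_scalar(uint32_t *s1, uint32_t *s2,
                                     const uint8_t *buf, size_t len) {
        for (size_t i = 0; i < len; ++i) {
            *s1 += buf[i];
            *s2 += *s1;
        }
    }

    /* Unrolling the same loop over one 16-byte block c[0..15] gives the
     * closed form the source comments describe:
     *
     *   s1' = s1 + (c[0] + c[1] + ... + c[15])
     *   s2' = s2 + 16*s1 + (16*c[0] + 15*c[1] + ... + 1*c[15])
     *
     * In the vector loop, _mm_sad_epu8 produces the byte sum for s1',
     * _mm_maddubs_epi16 with the 16..1 weights followed by _mm_madd_epi16
     * against dot3v produces the weighted sum, and the 16*s1 term is the
     * accumulated vs3 shifted left by 4. */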
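
The alignment heuristic described at source lines 40-44 reduces to the test at source line 50: with rem = (uintptr_t)buf & 15 and align_offset = 16 - rem, aligning via the scalar catch-up loop is only worthwhile if at least one full 16-byte block remains afterwards, i.e. if len >= 16 + align_offset. As a hypothetical worked example, a buffer starting at an address ending in 0x7 has rem = 7 and align_offset = 9. With len = 20, the test 20 < 16 + 9 holds, so the code takes the single unaligned load at source line 55 and lets the tail handler at source line 149 finish the remaining 4 bytes. With len = 40 the test fails, so 9 bytes are summed scalar (source lines 65-68) and the remaining data starts on a 16-byte boundary, letting the vector loop use aligned loads.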
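
partial_hsum and hsum, used at source lines 143-144, come from adler32_ssse3_p.h, which is not part of this report. The comment at source lines 140-142 explains the asymmetry: psadbw deposits its two 64-bit byte sums in 32-bit lanes 0 and 2, so vs1 only has live data in those lanes, while vs2 accumulates madd results in all four lanes. A rough, illustrative equivalent of the two reductions (the real helpers may be written differently):

    #include <stdint.h>
    #include <immintrin.h>

    /* Partial reduction: only lanes 0 and 2 of v are live, so a single
     * shuffle + add finishes the job. */
    static inline uint32_t partial_hsum_sketch(__m128i v) {
        __m128i hi = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 3, 2)); /* lane 0 <- old lane 2 */
        return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(v, hi));
    }

    /* Full reduction: fold all four 32-bit lanes down into lane 0. */
    static inline uint32_t hsum_sketch(__m128i v) {
        __m128i s = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
        s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
        return (uint32_t)_mm_cvtsi128_si32(s);
    }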
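
The comment at source line 156 records the design choice behind adler32_copy_ssse3: checksum with the SSSE3 kernel first, then copy with memcpy, rather than storing from vector registers into a possibly unaligned destination. A minimal usage sketch, assuming a caller that can see this internal (Z_INTERNAL) entry point and starts from the standard Adler-32 seed of 1; the zero counts at source lines 157-161 show this path was not exercised by the run behind this report:

    #include <stddef.h>
    #include <stdint.h>

    /* Prototype repeated here (normally provided by zlib-ng's internal
     * headers) so the sketch is self-contained. */
    uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);

    /* Checksum src while copying it into dst in a single call. */
    uint32_t checksum_and_copy(uint8_t *dst, const uint8_t *src, size_t len) {
        return adler32_copy_ssse3(1, dst, src, len);  /* 1 = Adler-32 initial value */
    }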