Coverage Report

Created: 2026-05-28 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/adler32_p.h
Line
Count
Source
1
/* adler32_p.h -- Private inline functions and macros shared by the
2
 *                different implementations of the Adler-32 checksum
3
 *                of a data stream.
4
 * Copyright (C) 1995-2011, 2016 Mark Adler
5
 * For conditions of distribution and use, see copyright notice in zlib.h
6
 */
7
8
#ifndef ADLER32_P_H
9
#define ADLER32_P_H
10
11
#include "zendian.h"
12
13
359k
#define BASE 65521U     /* largest prime smaller than 65536 */
14
2.83k
#define NMAX 5552
15
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
16
#define NMAX_ALIGNED32 (NMAX & ~31)
17
/* NMAX rounded down to a multiple of 32 is 5536 */
/* ADLER_DOn(sum1, sum2, buf, i): fold n consecutive bytes starting at buf[i]
 * into the two running sums. For every byte, the byte is added to sum1 and the
 * updated sum1 is then added to sum2. ADLER_DO2..ADLER_DO8 are built by
 * doubling the next smaller macro; ADLER_DO16 always starts at offset 0 and
 * therefore takes no index argument.
 * No modulo-BASE reduction happens here; that is left to the code using the
 * macros.
 * The macros expand to a brace-enclosed block (not do { } while (0)), so an
 * invocation followed by ';' inside an unbraced if/else would not parse as a
 * single statement. */
18
19
35.8k
#define ADLER_DO1(sum1, sum2, buf, i)  {(sum1) += buf[(i)]; (sum2) += (sum1);}
20
15.4k
#define ADLER_DO2(sum1, sum2, buf, i)  {ADLER_DO1(sum1, sum2, buf, i); ADLER_DO1(sum1, sum2, buf, i+1);}
21
5.12k
#define ADLER_DO4(sum1, sum2, buf, i)  {ADLER_DO2(sum1, sum2, buf, i); ADLER_DO2(sum1, sum2, buf, i+2);}
22
#define ADLER_DO8(sum1, sum2, buf, i)  {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);}
23
#define ADLER_DO16(sum1, sum2, buf)    {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);}
24
25
/* Fold exactly `len` bytes from `buf` into the running sums *adler (s1) and
 * *sum2 (s2), optionally copying them to `dst`.
 *
 *   adler   in/out: running s1, updated in place.
 *   dst     destination buffer; only written (and advanced) when COPY is non-zero.
 *   buf     source bytes.
 *   len     number of bytes to consume.
 *   sum2    in/out: running s2, updated in place.
 *   MAX_LEN unused in this implementation (only passed to Z_UNUSED).
 *   COPY    non-zero to also copy each consumed byte from buf to dst.
 *
 * The bytes are consumed smallest piece first: one byte if bit 0 of len is
 * set, two bytes if bit 1 is set, then groups of four. No modulo-BASE
 * reduction is performed here.
 * NOTE(review): despite the name, nothing in this function inspects pointer
 * alignment; presumably callers pass the byte count needed to reach an aligned
 * address -- confirm against the call sites. */
Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len,
26
0
                                             uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) {
27
0
    Z_UNUSED(MAX_LEN);
28
0
    /* Odd length: consume a single byte first. len itself is left untouched. */
    if (len & 1) {
29
0
        if (COPY) {
30
0
            *dst = *buf;
31
0
            dst += 1;
32
0
        }
33
0
        ADLER_DO1(*adler, *sum2, buf, 0);
34
0
        buf += 1;
35
0
    }
36
0
    /* Bit 1 set: consume two more bytes, again without modifying len. */
    if (len & 2) {
37
0
        if (COPY) {
38
0
            memcpy(dst, buf, 2);
39
0
            dst += 2;
40
0
        }
41
0
        ADLER_DO2(*adler, *sum2, buf, 0);
42
0
        buf += 2;
43
0
    }
44
0
    /* len still carries its low two bits here, so this loop runs len / 4
     * times and stops with those (already consumed) low bits remaining. */
    while (len >= 4) {
45
0
        if (COPY) {
46
0
            memcpy(dst, buf, 4);
47
0
            dst += 4;
48
0
        }
49
0
        len -= 4;
50
0
        ADLER_DO4(*adler, *sum2, buf, 0);
51
0
        buf += 4;
52
0
    }
53
0
}
Unexecuted instantiation: adler32_ssse3.c:adler32_copy_align
Unexecuted instantiation: adler32_sse42.c:adler32_copy_align
Unexecuted instantiation: adler32_avx2.c:adler32_copy_align
Unexecuted instantiation: adler32_avx512.c:adler32_copy_align
Unexecuted instantiation: adler32_avx512_vnni.c:adler32_copy_align
Unexecuted instantiation: adler32_c.c:adler32_copy_align
54
55
/* SIMD Within A Register (SWAR) scalar adler32. Splits bytes into
56
 * even/odd lanes packed as 4x16-bit in uint64_t, with prefix sums for s2.
57
 * Reduction uses multiply-and-shift with positional weight constants.
58
 *
59
 * Technique pioneered by Michael Niedermayer <michaelni@gmx.at>.
60
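 * Max chunk: 23 iterations * 8 bytes = 184. Each 16-bit sum lane then holds at
 * most 255*23 = 5865 and each 16-bit prefix lane at most
 * 255*(0+1+...+22) = 64515 <= 65535; a 24th iteration could overflow the
 * prefix lanes (255*276 = 70380). */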
61
/* Largest byte count handed to adler32_swar() in one call: 23 words of 8 bytes. */
#define ADLER32_SWAR_MAX_BYTES   (23 * 8)
62
8.59k
/* Keeps the low byte of each of the four 16-bit lanes of a 64-bit word. */
#define ADLER32_SWAR_EVEN_MASK   0x00FF00FF00FF00FFULL
63
8.59k
/* A 1 in every 16-bit lane: multiplying by it and shifting right by 48 yields
 * the sum of the four lanes (valid while that sum fits in 16 bits). */
#define ADLER32_SWAR_HSUM        0x1000100010001ULL
64
65
/* Fold `len` bytes into the running sums using packed 64-bit arithmetic,
 * optionally copying them to `dst`.
 *
 *   adler  in/out: running s1; the sum of all consumed bytes is added to it.
 *   dst    destination buffer; only written when COPY is non-zero. The pointer
 *          is passed by value, so the caller must advance its own copy.
 *   buf    source bytes, read as 64-bit words through a cast pointer.
 *   len    byte count. Only whole 8-byte words are consumed (16 bytes per loop
 *          iteration plus at most one trailing word), while the s1 carry-in
 *          below is scaled by the full len, so len is expected to be a
 *          multiple of 8. The caller visible in this file also caps it at
 *          ADLER32_SWAR_MAX_BYTES, which keeps the 16-bit lanes from
 *          overflowing.
 *   sum2   in/out: running s2.
 *   COPY   non-zero to also copy the consumed words to dst.
 *
 * No modulo-BASE reduction is performed here.
 * NOTE(review): buf is dereferenced as const uint64_t *. This relies on the
 * caller guaranteeing 8-byte alignment (adler32_copy_tail checks
 * ((uintptr_t)buf & 7) == 0) and is a type-punned load of uint8_t data;
 * loading through memcpy, as the stores to dst already do, would avoid both
 * assumptions. */
Z_FORCEINLINE static void adler32_swar(uint32_t *adler, uint8_t *dst, const uint8_t *buf, size_t len,
66
4.29k
                                       uint32_t *sum2, const int COPY) {
67
4.29k
    uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0;
68
69
4.29k
    /* The incoming s1 is added to s2 once for every byte processed. */
    *sum2 += *adler * (uint32_t)len;
70
71
4.29k
    const uint64_t *src64 = (const uint64_t *)buf;
72
73
4.29k
    /* Two words per iteration. For each word the prefix lanes first absorb the
     * byte sums seen so far, then the word's bytes are added to the sum lanes:
     * low byte of each 16-bit lane into sum_even, high byte into sum_odd. */
    while (len >= 16) {
74
0
        uint64_t v0 = src64[0];
75
0
        uint64_t v1 = src64[1];
76
0
        if (COPY) {
77
0
            memcpy(dst, &v0, sizeof(v0));
78
0
            memcpy(dst + 8, &v1, sizeof(v1));
79
0
            dst += 16;
80
0
        }
81
82
0
        prefix_even += sum_even;
83
0
        prefix_odd += sum_odd;
84
0
        sum_even +=  v0       & ADLER32_SWAR_EVEN_MASK;
85
0
        sum_odd  += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK;
86
87
0
        prefix_even += sum_even;
88
0
        prefix_odd += sum_odd;
89
0
        sum_even +=  v1       & ADLER32_SWAR_EVEN_MASK;
90
0
        sum_odd  += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK;
91
92
0
        src64 += 2;
93
0
        len -= 16;
94
0
    }
95
96
    /* Handle remaining 8 bytes if present */
97
4.29k
    if (len >= 8) {
98
4.29k
        uint64_t v = *src64;
99
4.29k
        if (COPY)
100
2.98k
            memcpy(dst, &v, sizeof(v));
101
102
4.29k
        prefix_even += sum_even;
103
4.29k
        prefix_odd += sum_odd;
104
4.29k
        sum_even +=  v       & ADLER32_SWAR_EVEN_MASK;
105
4.29k
        sum_odd  += (v >> 8) & ADLER32_SWAR_EVEN_MASK;
106
4.29k
    }
107
108
    /* Horizontal sum of 4x16-bit lanes for s1 */
109
4.29k
    *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48);
110
111
    /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */
112
4.29k
    uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL;
113
4.29k
    uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL;
114
4.29k
    uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL;
115
4.29k
    uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL;
116
117
4.29k
    /* 0x800000008 is 8 * (2^32 + 1): after the shift by 32 the result is the
     * sum of the two 32-bit halves multiplied by 8, i.e. every byte of an
     * earlier word is counted once for each of the 8 bytes of a later word. */
    *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32);
118
119
    /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2.
120
     * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7)
121
     * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */
122
4.29k
#if BYTE_ORDER == LITTLE_ENDIAN
123
4.29k
    *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48)
124
4.29k
           +     (uint32_t)((sum_odd  * ADLER32_SWAR_HSUM) >> 48)
125
4.29k
           + 2 * (uint32_t)((sum_odd  * 0x3000200010000ULL) >> 48);
126
#else
127
    *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48)
128
           +     (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48)
129
           + 2 * (uint32_t)((sum_odd  * 0x1000200030004ULL) >> 48);
130
#endif
131
4.29k
}
adler32_ssse3.c:adler32_swar
Line
Count
Source
66
591
                                       uint32_t *sum2, const int COPY) {
67
591
    uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0;
68
69
591
    *sum2 += *adler * (uint32_t)len;
70
71
591
    const uint64_t *src64 = (const uint64_t *)buf;
72
73
591
    while (len >= 16) {
74
0
        uint64_t v0 = src64[0];
75
0
        uint64_t v1 = src64[1];
76
0
        if (COPY) {
77
0
            memcpy(dst, &v0, sizeof(v0));
78
0
            memcpy(dst + 8, &v1, sizeof(v1));
79
0
            dst += 16;
80
0
        }
81
82
0
        prefix_even += sum_even;
83
0
        prefix_odd += sum_odd;
84
0
        sum_even +=  v0       & ADLER32_SWAR_EVEN_MASK;
85
0
        sum_odd  += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK;
86
87
0
        prefix_even += sum_even;
88
0
        prefix_odd += sum_odd;
89
0
        sum_even +=  v1       & ADLER32_SWAR_EVEN_MASK;
90
0
        sum_odd  += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK;
91
92
0
        src64 += 2;
93
0
        len -= 16;
94
0
    }
95
96
    /* Handle remaining 8 bytes if present */
97
591
    if (len >= 8) {
98
591
        uint64_t v = *src64;
99
591
        if (COPY)
100
0
            memcpy(dst, &v, sizeof(v));
101
102
591
        prefix_even += sum_even;
103
591
        prefix_odd += sum_odd;
104
591
        sum_even +=  v       & ADLER32_SWAR_EVEN_MASK;
105
591
        sum_odd  += (v >> 8) & ADLER32_SWAR_EVEN_MASK;
106
591
    }
107
108
    /* Horizontal sum of 4x16-bit lanes for s1 */
109
591
    *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48);
110
111
    /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */
112
591
    uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL;
113
591
    uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL;
114
591
    uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL;
115
591
    uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL;
116
117
591
    *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32);
118
119
    /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2.
120
     * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7)
121
     * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */
122
591
#if BYTE_ORDER == LITTLE_ENDIAN
123
591
    *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48)
124
591
           +     (uint32_t)((sum_odd  * ADLER32_SWAR_HSUM) >> 48)
125
591
           + 2 * (uint32_t)((sum_odd  * 0x3000200010000ULL) >> 48);
126
#else
127
    *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48)
128
           +     (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48)
129
           + 2 * (uint32_t)((sum_odd  * 0x1000200030004ULL) >> 48);
130
#endif
131
591
}
adler32_sse42.c:adler32_swar
Line
Count
Source
66
1.58k
                                       uint32_t *sum2, const int COPY) {
67
1.58k
    uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0;
68
69
1.58k
    *sum2 += *adler * (uint32_t)len;
70
71
1.58k
    const uint64_t *src64 = (const uint64_t *)buf;
72
73
1.58k
    while (len >= 16) {
74
0
        uint64_t v0 = src64[0];
75
0
        uint64_t v1 = src64[1];
76
0
        if (COPY) {
77
0
            memcpy(dst, &v0, sizeof(v0));
78
0
            memcpy(dst + 8, &v1, sizeof(v1));
79
0
            dst += 16;
80
0
        }
81
82
0
        prefix_even += sum_even;
83
0
        prefix_odd += sum_odd;
84
0
        sum_even +=  v0       & ADLER32_SWAR_EVEN_MASK;
85
0
        sum_odd  += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK;
86
87
0
        prefix_even += sum_even;
88
0
        prefix_odd += sum_odd;
89
0
        sum_even +=  v1       & ADLER32_SWAR_EVEN_MASK;
90
0
        sum_odd  += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK;
91
92
0
        src64 += 2;
93
0
        len -= 16;
94
0
    }
95
96
    /* Handle remaining 8 bytes if present */
97
1.58k
    if (len >= 8) {
98
1.58k
        uint64_t v = *src64;
99
1.58k
        if (COPY)
100
1.58k
            memcpy(dst, &v, sizeof(v));
101
102
1.58k
        prefix_even += sum_even;
103
1.58k
        prefix_odd += sum_odd;
104
1.58k
        sum_even +=  v       & ADLER32_SWAR_EVEN_MASK;
105
1.58k
        sum_odd  += (v >> 8) & ADLER32_SWAR_EVEN_MASK;
106
1.58k
    }
107
108
    /* Horizontal sum of 4x16-bit lanes for s1 */
109
1.58k
    *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48);
110
111
    /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */
112
1.58k
    uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL;
113
1.58k
    uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL;
114
1.58k
    uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL;
115
1.58k
    uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL;
116
117
1.58k
    *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32);
118
119
    /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2.
120
     * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7)
121
     * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */
122
1.58k
#if BYTE_ORDER == LITTLE_ENDIAN
123
1.58k
    *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48)
124
1.58k
           +     (uint32_t)((sum_odd  * ADLER32_SWAR_HSUM) >> 48)
125
1.58k
           + 2 * (uint32_t)((sum_odd  * 0x3000200010000ULL) >> 48);
126
#else
127
    *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48)
128
           +     (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48)
129
           + 2 * (uint32_t)((sum_odd  * 0x1000200030004ULL) >> 48);
130
#endif
131
1.58k
}
adler32_avx2.c:adler32_swar
Line
Count
Source
66
2.12k
                                       uint32_t *sum2, const int COPY) {
67
2.12k
    uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0;
68
69
2.12k
    *sum2 += *adler * (uint32_t)len;
70
71
2.12k
    const uint64_t *src64 = (const uint64_t *)buf;
72
73
2.12k
    while (len >= 16) {
74
0
        uint64_t v0 = src64[0];
75
0
        uint64_t v1 = src64[1];
76
0
        if (COPY) {
77
0
            memcpy(dst, &v0, sizeof(v0));
78
0
            memcpy(dst + 8, &v1, sizeof(v1));
79
0
            dst += 16;
80
0
        }
81
82
0
        prefix_even += sum_even;
83
0
        prefix_odd += sum_odd;
84
0
        sum_even +=  v0       & ADLER32_SWAR_EVEN_MASK;
85
0
        sum_odd  += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK;
86
87
0
        prefix_even += sum_even;
88
0
        prefix_odd += sum_odd;
89
0
        sum_even +=  v1       & ADLER32_SWAR_EVEN_MASK;
90
0
        sum_odd  += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK;
91
92
0
        src64 += 2;
93
0
        len -= 16;
94
0
    }
95
96
    /* Handle remaining 8 bytes if present */
97
2.12k
    if (len >= 8) {
98
2.12k
        uint64_t v = *src64;
99
2.12k
        if (COPY)
100
1.40k
            memcpy(dst, &v, sizeof(v));
101
102
2.12k
        prefix_even += sum_even;
103
2.12k
        prefix_odd += sum_odd;
104
2.12k
        sum_even +=  v       & ADLER32_SWAR_EVEN_MASK;
105
2.12k
        sum_odd  += (v >> 8) & ADLER32_SWAR_EVEN_MASK;
106
2.12k
    }
107
108
    /* Horizontal sum of 4x16-bit lanes for s1 */
109
2.12k
    *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48);
110
111
    /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */
112
2.12k
    uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL;
113
2.12k
    uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL;
114
2.12k
    uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL;
115
2.12k
    uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL;
116
117
2.12k
    *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32);
118
119
    /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2.
120
     * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7)
121
     * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */
122
2.12k
#if BYTE_ORDER == LITTLE_ENDIAN
123
2.12k
    *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48)
124
2.12k
           +     (uint32_t)((sum_odd  * ADLER32_SWAR_HSUM) >> 48)
125
2.12k
           + 2 * (uint32_t)((sum_odd  * 0x3000200010000ULL) >> 48);
126
#else
127
    *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48)
128
           +     (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48)
129
           + 2 * (uint32_t)((sum_odd  * 0x1000200030004ULL) >> 48);
130
#endif
131
2.12k
}
Unexecuted instantiation: adler32_avx512.c:adler32_swar
Unexecuted instantiation: adler32_avx512_vnni.c:adler32_swar
Unexecuted instantiation: adler32_c.c:adler32_swar
132
133
/* Fold the final `len` bytes of a buffer into the running sums and return the
 * packed Adler-32 value (sum2 in the upper 16 bits, adler in the lower 16).
 *
 *   adler   running s1 on entry.
 *   dst     destination buffer; only written when COPY is non-zero.
 *   buf     source bytes.
 *   len     number of bytes to consume; may be 0.
 *   sum2    running s2 on entry.
 *   REBASE  non-zero to reduce both sums modulo BASE before packing.
 *   MAX_LEN unused in this implementation (only passed to Z_UNUSED).
 *   COPY    non-zero to also copy each consumed byte from buf to dst.
 *
 * Returns adler | (sum2 << 16).
 * NOTE(review): when REBASE is 0 the sums are packed without reduction;
 * presumably such callers guarantee both are already below 65536 -- confirm
 * against the call sites. */
Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len,
134
9.43k
                                                uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
135
9.43k
    if (len) {
136
9.34k
        Z_UNUSED(MAX_LEN);
137
        /* Process using packed 64-bit arithmetic when source is aligned */
138
13.6k
        while (len >= 8 && ((uintptr_t)buf & 7) == 0) {
139
4.29k
            /* Whole 8-byte words only, capped at ADLER32_SWAR_MAX_BYTES per
             * call. Both bounds are multiples of 8, so buf stays 8-byte
             * aligned for the next pass. */
            size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES);
140
4.29k
            adler32_swar(&adler, dst, buf, chunk, &sum2, COPY);
141
4.29k
            buf += chunk;
142
4.29k
            /* adler32_swar receives dst by value, so advance it here. */
            if (COPY)
143
2.98k
                dst += chunk;
144
4.29k
            len -= chunk;
145
4.29k
        }
146
        /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
147
14.4k
        while (len >= 4) {
148
5.12k
            if (COPY) {
149
3.91k
                memcpy(dst, buf, 4);
150
3.91k
                dst += 4;
151
3.91k
            }
152
5.12k
            len -= 4;
153
5.12k
            ADLER_DO4(adler, sum2, buf, 0);
154
5.12k
            buf += 4;
155
5.12k
        }
156
9.34k
        /* At most 3 bytes remain: handle a pair, then a single byte. */
        if (len & 2) {
157
5.19k
            if (COPY) {
158
3.67k
                memcpy(dst, buf, 2);
159
3.67k
                dst += 2;
160
3.67k
            }
161
5.19k
            ADLER_DO2(adler, sum2, buf, 0);
162
5.19k
            buf += 2;
163
5.19k
        }
164
9.34k
        if (len & 1) {
165
5.00k
            if (COPY)
166
3.54k
                *dst = *buf;
167
5.00k
            ADLER_DO1(adler, sum2, buf, 0);
168
5.00k
        }
169
9.34k
    }
170
9.43k
    if (REBASE) {
171
9.34k
        adler %= BASE;
172
9.34k
        sum2 %= BASE;
173
9.34k
    }
174
    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
175
9.43k
    return adler | (sum2 << 16);
176
9.43k
}
adler32_ssse3.c:adler32_copy_tail
Line
Count
Source
134
1.41k
                                                uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
135
1.41k
    if (len) {
136
1.32k
        Z_UNUSED(MAX_LEN);
137
        /* Process using packed 64-bit arithmetic when source is aligned */
138
1.92k
        while (len >= 8 && ((uintptr_t)buf & 7) == 0) {
139
591
            size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES);
140
591
            adler32_swar(&adler, dst, buf, chunk, &sum2, COPY);
141
591
            buf += chunk;
142
591
            if (COPY)
143
0
                dst += chunk;
144
591
            len -= chunk;
145
591
        }
146
        /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
147
1.92k
        while (len >= 4) {
148
591
            if (COPY) {
149
0
                memcpy(dst, buf, 4);
150
0
                dst += 4;
151
0
            }
152
591
            len -= 4;
153
591
            ADLER_DO4(adler, sum2, buf, 0);
154
591
            buf += 4;
155
591
        }
156
1.32k
        if (len & 2) {
157
691
            if (COPY) {
158
0
                memcpy(dst, buf, 2);
159
0
                dst += 2;
160
0
            }
161
691
            ADLER_DO2(adler, sum2, buf, 0);
162
691
            buf += 2;
163
691
        }
164
1.32k
        if (len & 1) {
165
689
            if (COPY)
166
0
                *dst = *buf;
167
689
            ADLER_DO1(adler, sum2, buf, 0);
168
689
        }
169
1.32k
    }
170
1.41k
    if (REBASE) {
171
1.32k
        adler %= BASE;
172
1.32k
        sum2 %= BASE;
173
1.32k
    }
174
    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
175
1.41k
    return adler | (sum2 << 16);
176
1.41k
}
adler32_sse42.c:adler32_copy_tail
Line
Count
Source
134
3.37k
                                                uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
135
3.37k
    if (len) {
136
3.37k
        Z_UNUSED(MAX_LEN);
137
        /* Process using packed 64-bit arithmetic when source is aligned */
138
4.95k
        while (len >= 8 && ((uintptr_t)buf & 7) == 0) {
139
1.58k
            size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES);
140
1.58k
            adler32_swar(&adler, dst, buf, chunk, &sum2, COPY);
141
1.58k
            buf += chunk;
142
1.58k
            if (COPY)
143
1.58k
                dst += chunk;
144
1.58k
            len -= chunk;
145
1.58k
        }
146
        /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
147
5.39k
        while (len >= 4) {
148
2.02k
            if (COPY) {
149
2.02k
                memcpy(dst, buf, 4);
150
2.02k
                dst += 4;
151
2.02k
            }
152
2.02k
            len -= 4;
153
2.02k
            ADLER_DO4(adler, sum2, buf, 0);
154
2.02k
            buf += 4;
155
2.02k
        }
156
3.37k
        if (len & 2) {
157
1.83k
            if (COPY) {
158
1.83k
                memcpy(dst, buf, 2);
159
1.83k
                dst += 2;
160
1.83k
            }
161
1.83k
            ADLER_DO2(adler, sum2, buf, 0);
162
1.83k
            buf += 2;
163
1.83k
        }
164
3.37k
        if (len & 1) {
165
1.76k
            if (COPY)
166
1.76k
                *dst = *buf;
167
1.76k
            ADLER_DO1(adler, sum2, buf, 0);
168
1.76k
        }
169
3.37k
    }
170
3.37k
    if (REBASE) {
171
3.37k
        adler %= BASE;
172
3.37k
        sum2 %= BASE;
173
3.37k
    }
174
    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
175
3.37k
    return adler | (sum2 << 16);
176
3.37k
}
adler32_avx2.c:adler32_copy_tail
Line
Count
Source
134
4.64k
                                                uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
135
4.64k
    if (len) {
136
4.64k
        Z_UNUSED(MAX_LEN);
137
        /* Process using packed 64-bit arithmetic when source is aligned */
138
6.76k
        while (len >= 8 && ((uintptr_t)buf & 7) == 0) {
139
2.12k
            size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES);
140
2.12k
            adler32_swar(&adler, dst, buf, chunk, &sum2, COPY);
141
2.12k
            buf += chunk;
142
2.12k
            if (COPY)
143
1.40k
                dst += chunk;
144
2.12k
            len -= chunk;
145
2.12k
        }
146
        /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
147
7.15k
        while (len >= 4) {
148
2.50k
            if (COPY) {
149
1.88k
                memcpy(dst, buf, 4);
150
1.88k
                dst += 4;
151
1.88k
            }
152
2.50k
            len -= 4;
153
2.50k
            ADLER_DO4(adler, sum2, buf, 0);
154
2.50k
            buf += 4;
155
2.50k
        }
156
4.64k
        if (len & 2) {
157
2.66k
            if (COPY) {
158
1.83k
                memcpy(dst, buf, 2);
159
1.83k
                dst += 2;
160
1.83k
            }
161
2.66k
            ADLER_DO2(adler, sum2, buf, 0);
162
2.66k
            buf += 2;
163
2.66k
        }
164
4.64k
        if (len & 1) {
165
2.54k
            if (COPY)
166
1.78k
                *dst = *buf;
167
2.54k
            ADLER_DO1(adler, sum2, buf, 0);
168
2.54k
        }
169
4.64k
    }
170
4.64k
    if (REBASE) {
171
4.64k
        adler %= BASE;
172
4.64k
        sum2 %= BASE;
173
4.64k
    }
174
    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
175
4.64k
    return adler | (sum2 << 16);
176
4.64k
}
Unexecuted instantiation: adler32_avx512.c:adler32_copy_tail
Unexecuted instantiation: adler32_avx512_vnni.c:adler32_copy_tail
Unexecuted instantiation: adler32_c.c:adler32_copy_tail
177
178
#endif /* ADLER32_P_H */