Coverage Report

Created: 2026-05-30 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/chunkset_tpl.h
Line
Count
Source
1
/* chunkset_tpl.h -- inline functions to copy small data chunks.
2
 * For conditions of distribution and use, see copyright notice in zlib.h
3
 */
4
5
#include "zbuild.h"
6
#include <stdlib.h>
7
8
/* Returns the chunk size */
9
1.24M
static inline size_t CHUNKSIZE(void) {
10
1.24M
    return sizeof(chunk_t);
11
1.24M
}
Unexecuted instantiation: chunkset_sse2.c:chunksize_sse2
Unexecuted instantiation: chunkset_ssse3.c:chunksize_ssse3
chunkset_avx2.c:chunksize_avx2
Line
Count
Source
9
1.24M
static inline size_t CHUNKSIZE(void) {
10
1.24M
    return sizeof(chunk_t);
11
1.24M
}
Unexecuted instantiation: chunkset_avx512.c:chunksize_avx512
12
13
/* Behave like memcpy, but assume that it's OK to overwrite at least
14
   chunk_t bytes of output even if the length is shorter than this,
15
   that the length is non-zero, and that `from` lags `out` by at least
16
   sizeof chunk_t bytes (or that they don't overlap at all or simply that
17
   the distance is less than the length of the copy).
18
19
   Aside from better memory bus utilization, this means that short copies
20
   (chunk_t bytes or fewer) will fall straight through the loop
21
   without iteration, which will hopefully make the branch prediction more
22
   reliable. */
23
#ifndef HAVE_CHUNKCOPY
24
15.0M
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
25
15.0M
    Assert(len > 0, "chunkcopy should never have a length 0");
26
15.0M
    chunk_t chunk;
27
15.0M
    size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
28
15.0M
    loadchunk(from, &chunk);
29
15.0M
    storechunk(out, &chunk);
30
15.0M
    out += align;
31
15.0M
    from += align;
32
15.0M
    len -= align;
33
21.8M
    while (len > 0) {
34
6.72M
        loadchunk(from, &chunk);
35
6.72M
        storechunk(out, &chunk);
36
6.72M
        out += sizeof(chunk_t);
37
6.72M
        from += sizeof(chunk_t);
38
6.72M
        len -= sizeof(chunk_t);
39
6.72M
    }
40
15.0M
    return out;
41
15.0M
}
Unexecuted instantiation: chunkset_sse2.c:chunkcopy_sse2
Unexecuted instantiation: chunkset_ssse3.c:chunkcopy_ssse3
chunkset_avx2.c:chunkcopy_avx2
Line
Count
Source
24
15.0M
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
25
15.0M
    Assert(len > 0, "chunkcopy should never have a length 0");
26
15.0M
    chunk_t chunk;
27
15.0M
    size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
28
15.0M
    loadchunk(from, &chunk);
29
15.0M
    storechunk(out, &chunk);
30
15.0M
    out += align;
31
15.0M
    from += align;
32
15.0M
    len -= align;
33
21.8M
    while (len > 0) {
34
6.72M
        loadchunk(from, &chunk);
35
6.72M
        storechunk(out, &chunk);
36
6.72M
        out += sizeof(chunk_t);
37
6.72M
        from += sizeof(chunk_t);
38
6.72M
        len -= sizeof(chunk_t);
39
6.72M
    }
40
15.0M
    return out;
41
15.0M
}
42
#endif
43
44
/* Perform short copies until distance can be rewritten as being at least
45
   sizeof chunk_t.
46
47
   This assumes that it's OK to overwrite at least the first
48
   2*sizeof(chunk_t) bytes of output even if the copy is shorter than this.
49
   This assumption holds because inflate_fast() starts every iteration with at
50
   least 258 bytes of output space available (258 being the maximum length
51
   output from a single token; see inflate_fast()'s assumptions). */
52
0
static inline uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
53
0
    unsigned char const *from = out - *dist;
54
0
    chunk_t chunk;
55
0
    while (*dist < *len && *dist < sizeof(chunk_t)) {
56
0
        loadchunk(from, &chunk);
57
0
        storechunk(out, &chunk);
58
0
        out += *dist;
59
0
        *len -= *dist;
60
0
        *dist += *dist;
61
0
    }
62
0
    return out;
63
0
}
Unexecuted instantiation: chunkset_sse2.c:chunkunroll_sse2
Unexecuted instantiation: chunkset_ssse3.c:chunkunroll_ssse3
Unexecuted instantiation: chunkset_avx2.c:chunkunroll_avx2
Unexecuted instantiation: chunkset_avx512.c:chunkunroll_avx512
64
65
#ifndef HAVE_CHUNK_MAG
66
/* Loads a "magazine": a chunk filled with repetitions of the pattern, ready to be stored to memory */
67
0
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
68
        /* This code takes the string of length dist starting at "buf" and repeats
69
         * it as many times as fit in a chunk_t (vector register); the last copy may be partial */
70
0
        size_t cpy_dist;
71
0
        size_t bytes_remaining = sizeof(chunk_t);
72
0
        chunk_t chunk_load;
73
0
        uint8_t *cur_chunk = (uint8_t *)&chunk_load;
74
0
        while (bytes_remaining) {
75
0
            cpy_dist = MIN(dist, bytes_remaining);
76
0
            memcpy(cur_chunk, buf, cpy_dist);
77
0
            bytes_remaining -= cpy_dist;
78
0
            cur_chunk += cpy_dist;
79
            /* This allows us to bypass an expensive integer division since we're effectively
80
             * counting in this loop, anyway */
81
0
            *chunk_rem = cpy_dist;
82
0
        }
83
84
0
        return chunk_load;
85
0
}
86
#endif
87
88
#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
89
231
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
90
231
    Assert(len > 0, "halfchunkcopy should never have a length 0");
91
231
    halfchunk_t chunk;
92
231
    size_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
93
231
    loadhalfchunk(from, &chunk);
94
231
    storehalfchunk(out, &chunk);
95
231
    out += align;
96
231
    from += align;
97
231
    len -= align;
98
231
    while (len > 0) {
99
0
        loadhalfchunk(from, &chunk);
100
0
        storehalfchunk(out, &chunk);
101
0
        out += sizeof(halfchunk_t);
102
0
        from += sizeof(halfchunk_t);
103
0
        len -= sizeof(halfchunk_t);
104
0
    }
105
231
    return out;
106
231
}
107
#endif
108
109
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
110
   Return OUT + LEN. */
111
1.23M
static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, size_t len) {
112
    /* Debug performance related issues when len < sizeof(uint64_t):
113
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
114
1.23M
    Assert(from != out, "chunkmemset cannot have a distance 0");
115
116
1.23M
    chunk_t chunk_load;
117
1.23M
    size_t chunk_mod = 0;
118
1.23M
    size_t adv_amount;
119
1.23M
    size_t dist = (size_t)ABS(out - from);
120
121
    /* We support the case where we are reading bytes from ahead in the buffer (from > out).
122
     * We now have to handle this, though it wasn't _quite_ clear whether this rare circumstance
123
     * always needed to be handled here or whether we're only now seeing it because we are
124
     * dispatching to this function more often */
125
1.23M
    if (out < from && dist < len) {
126
#ifdef HAVE_MASKED_READWRITE
127
        /* We can still handle this case if we can mitigate overwriting _and_ we
128
         * fit the entirety of the copy length with one load */
129
0
        if (len <= sizeof(chunk_t)) {
130
            /* Tempting to add a goto to the block below but hopefully most compilers
131
             * collapse these identical code segments as one label to jump to */
132
0
            return CHUNKCOPY(out, from, len);
133
0
        }
134
0
#endif
135
        /* Here the memmove semantics match perfectly, as when this happens we are
136
         * effectively sliding down the contents of memory by dist bytes */
137
0
        memmove(out, from, len);
138
0
        return out + len;
139
0
    }
140
141
1.23M
    if (dist == 1) {
142
660k
        memset(out, *from, len);
143
660k
        return out + len;
144
660k
    } else if (dist >= sizeof(chunk_t)) {
145
3.87k
        return CHUNKCOPY(out, from, len);
146
3.87k
    }
147
148
    /* Only AVX2+, as there are both 128-bit and 256-bit vectors. We allow for shorter vector
149
     * lengths because they serve to allow more cases to fall into chunkcopy, as the
150
     * distance of the shorter length is still deemed a safe distance. We rewrite this
151
     * here rather than calling the ssse3 variant directly now because doing so required
152
     * dispatching to another function and broke inlining for this function entirely. We
153
     * also can merge an assert and some remainder peeling behavior into the same code blocks,
154
     * making the code a little smaller.  */
155
#ifdef HAVE_HALF_CHUNK
156
566k
    if (len <= sizeof(halfchunk_t)) {
157
242k
        if (dist >= sizeof(halfchunk_t))
158
231
            return HALFCHUNKCOPY(out, from, len);
159
160
242k
        if ((dist % 2) != 0 || dist == 6) {
161
60.5k
            halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
162
163
60.5k
            if (len == sizeof(halfchunk_t)) {
164
1.19k
                storehalfchunk(out, &halfchunk_load);
165
1.19k
                len -= sizeof(halfchunk_t);
166
1.19k
                out += sizeof(halfchunk_t);
167
1.19k
            }
168
169
60.5k
            chunk_load = halfchunk2whole(&halfchunk_load);
170
60.5k
            goto rem_bytes;
171
60.5k
        }
172
242k
    }
173
505k
#endif
174
175
505k
#ifdef HAVE_CHUNKMEMSET_2
176
505k
    if (dist == 2) {
177
183k
        chunkmemset_2(from, &chunk_load);
178
183k
    } else
179
322k
#endif
180
322k
#ifdef HAVE_CHUNKMEMSET_4
181
322k
    if (dist == 4) {
182
62.7k
        chunkmemset_4(from, &chunk_load);
183
62.7k
    } else
184
259k
#endif
185
259k
#ifdef HAVE_CHUNKMEMSET_8
186
259k
    if (dist == 8) {
187
5.11k
        chunkmemset_8(from, &chunk_load);
188
5.11k
    } else
189
254k
#endif
190
#ifdef HAVE_CHUNKMEMSET_16
191
254k
    if (dist == 16) {
192
2.06k
        chunkmemset_16(from, &chunk_load);
193
2.06k
    } else
194
252k
#endif
195
252k
    chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
196
197
0
    adv_amount = sizeof(chunk_t) - chunk_mod;
198
199
1.25M
    while (len >= (2 * sizeof(chunk_t))) {
200
751k
        storechunk(out, &chunk_load);
201
751k
        storechunk(out + adv_amount, &chunk_load);
202
751k
        out += 2 * adv_amount;
203
751k
        len -= 2 * adv_amount;
204
751k
    }
205
206
    /* If we don't have a "dist" length that divides evenly into a vector
207
     * register, we can write the whole vector register but we need only
208
     * advance by the amount of the whole string that fits in our chunk_t.
209
     * If we do divide evenly into the vector length, adv_amount = chunk_t size. */
210
564k
    while (len >= sizeof(chunk_t)) {
211
59.1k
        storechunk(out, &chunk_load);
212
59.1k
        len -= adv_amount;
213
59.1k
        out += adv_amount;
214
59.1k
    }
215
216
#ifdef HAVE_HALF_CHUNK
217
566k
rem_bytes:
218
566k
#endif
219
566k
    if (len) {
220
560k
        memcpy(out, &chunk_load, len);
221
560k
        out += len;
222
560k
    }
223
224
566k
    return out;
225
505k
}
Unexecuted instantiation: chunkset_sse2.c:chunkmemset_sse2
Unexecuted instantiation: chunkset_ssse3.c:chunkmemset_ssse3
chunkset_avx2.c:chunkmemset_avx2
Line
Count
Source
111
1.23M
static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, size_t len) {
112
    /* Debug performance related issues when len < sizeof(uint64_t):
113
       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
114
1.23M
    Assert(from != out, "chunkmemset cannot have a distance 0");
115
116
1.23M
    chunk_t chunk_load;
117
1.23M
    size_t chunk_mod = 0;
118
1.23M
    size_t adv_amount;
119
1.23M
    size_t dist = (size_t)ABS(out - from);
120
121
    /* We support the case where we are reading bytes from ahead in the buffer (from > out).
122
     * We now have to handle this, though it wasn't _quite_ clear whether this rare circumstance
123
     * always needed to be handled here or whether we're only now seeing it because we are
124
     * dispatching to this function more often */
125
1.23M
    if (out < from && dist < len) {
126
#ifdef HAVE_MASKED_READWRITE
127
        /* We can still handle this case if we can mitigate overwriting _and_ we
128
         * fit the entirety of the copy length with one load */
129
        if (len <= sizeof(chunk_t)) {
130
            /* Tempting to add a goto to the block below but hopefully most compilers
131
             * collapse these identical code segments as one label to jump to */
132
            return CHUNKCOPY(out, from, len);
133
        }
134
#endif
135
        /* Here the memmove semantics match perfectly, as when this happens we are
136
         * effectively sliding down the contents of memory by dist bytes */
137
0
        memmove(out, from, len);
138
0
        return out + len;
139
0
    }
140
141
1.23M
    if (dist == 1) {
142
660k
        memset(out, *from, len);
143
660k
        return out + len;
144
660k
    } else if (dist >= sizeof(chunk_t)) {
145
3.87k
        return CHUNKCOPY(out, from, len);
146
3.87k
    }
147
148
    /* Only AVX2+, as there are both 128-bit and 256-bit vectors. We allow for shorter vector
149
     * lengths because they serve to allow more cases to fall into chunkcopy, as the
150
     * distance of the shorter length is still deemed a safe distance. We rewrite this
151
     * here rather than calling the ssse3 variant directly now because doing so required
152
     * dispatching to another function and broke inlining for this function entirely. We
153
     * also can merge an assert and some remainder peeling behavior into the same code blocks,
154
     * making the code a little smaller.  */
155
566k
#ifdef HAVE_HALF_CHUNK
156
566k
    if (len <= sizeof(halfchunk_t)) {
157
242k
        if (dist >= sizeof(halfchunk_t))
158
231
            return HALFCHUNKCOPY(out, from, len);
159
160
242k
        if ((dist % 2) != 0 || dist == 6) {
161
60.5k
            halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
162
163
60.5k
            if (len == sizeof(halfchunk_t)) {
164
1.19k
                storehalfchunk(out, &halfchunk_load);
165
1.19k
                len -= sizeof(halfchunk_t);
166
1.19k
                out += sizeof(halfchunk_t);
167
1.19k
            }
168
169
60.5k
            chunk_load = halfchunk2whole(&halfchunk_load);
170
60.5k
            goto rem_bytes;
171
60.5k
        }
172
242k
    }
173
505k
#endif
174
175
505k
#ifdef HAVE_CHUNKMEMSET_2
176
505k
    if (dist == 2) {
177
183k
        chunkmemset_2(from, &chunk_load);
178
183k
    } else
179
322k
#endif
180
322k
#ifdef HAVE_CHUNKMEMSET_4
181
322k
    if (dist == 4) {
182
62.7k
        chunkmemset_4(from, &chunk_load);
183
62.7k
    } else
184
259k
#endif
185
259k
#ifdef HAVE_CHUNKMEMSET_8
186
259k
    if (dist == 8) {
187
5.11k
        chunkmemset_8(from, &chunk_load);
188
5.11k
    } else
189
254k
#endif
190
254k
#ifdef HAVE_CHUNKMEMSET_16
191
254k
    if (dist == 16) {
192
2.06k
        chunkmemset_16(from, &chunk_load);
193
2.06k
    } else
194
252k
#endif
195
252k
    chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
196
197
505k
    adv_amount = sizeof(chunk_t) - chunk_mod;
198
199
1.25M
    while (len >= (2 * sizeof(chunk_t))) {
200
751k
        storechunk(out, &chunk_load);
201
751k
        storechunk(out + adv_amount, &chunk_load);
202
751k
        out += 2 * adv_amount;
203
751k
        len -= 2 * adv_amount;
204
751k
    }
205
206
    /* If we don't have a "dist" length that divides evenly into a vector
207
     * register, we can write the whole vector register but we need only
208
     * advance by the amount of the whole string that fits in our chunk_t.
209
     * If we do divide evenly into the vector length, adv_amount = chunk_t size. */
210
564k
    while (len >= sizeof(chunk_t)) {
211
59.1k
        storechunk(out, &chunk_load);
212
59.1k
        len -= adv_amount;
213
59.1k
        out += adv_amount;
214
59.1k
    }
215
216
505k
#ifdef HAVE_HALF_CHUNK
217
566k
rem_bytes:
218
566k
#endif
219
566k
    if (len) {
220
560k
        memcpy(out, &chunk_load, len);
221
560k
        out += len;
222
560k
    }
223
224
566k
    return out;
225
505k
}
Unexecuted instantiation: chunkset_avx512.c:chunkmemset_avx512
226
227
26.0k
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, size_t len, size_t left) {
228
#if OPTIMAL_CMP < 32
229
    static const uintptr_t align_mask = 7;
230
#elif OPTIMAL_CMP == 32
231
    static const uintptr_t align_mask = 3;
232
#endif
233
234
26.0k
    len = MIN(len, left);
235
236
#if OPTIMAL_CMP < 64
237
    while (((uintptr_t)out & align_mask) && (len > 0)) {
238
        *out++ = *from++;
239
        --len;
240
        --left;
241
    }
242
#endif
243
244
#ifndef HAVE_MASKED_READWRITE
245
26.0k
    if (UNLIKELY(left < sizeof(chunk_t))) {
246
119k
        while (len > 0) {
247
105k
            *out++ = *from++;
248
105k
            --len;
249
105k
        }
250
251
14.3k
        return out;
252
14.3k
    }
253
11.6k
#endif
254
255
11.6k
    if (len)
256
11.6k
        out = CHUNKMEMSET(out, from, len);
257
258
11.6k
    return out;
259
26.0k
}
Unexecuted instantiation: chunkmemset_safe_sse2
Unexecuted instantiation: chunkmemset_safe_ssse3
chunkmemset_safe_avx2
Line
Count
Source
227
26.0k
Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, size_t len, size_t left) {
228
#if OPTIMAL_CMP < 32
229
    static const uintptr_t align_mask = 7;
230
#elif OPTIMAL_CMP == 32
231
    static const uintptr_t align_mask = 3;
232
#endif
233
234
26.0k
    len = MIN(len, left);
235
236
#if OPTIMAL_CMP < 64
237
    while (((uintptr_t)out & align_mask) && (len > 0)) {
238
        *out++ = *from++;
239
        --len;
240
        --left;
241
    }
242
#endif
243
244
26.0k
#ifndef HAVE_MASKED_READWRITE
245
26.0k
    if (UNLIKELY(left < sizeof(chunk_t))) {
246
119k
        while (len > 0) {
247
105k
            *out++ = *from++;
248
105k
            --len;
249
105k
        }
250
251
14.3k
        return out;
252
14.3k
    }
253
11.6k
#endif
254
255
11.6k
    if (len)
256
11.6k
        out = CHUNKMEMSET(out, from, len);
257
258
11.6k
    return out;
259
26.0k
}
Unexecuted instantiation: chunkmemset_safe_avx512
260
261
static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe)
262
0
{
263
0
    if (out == from)
264
0
        return out + len;
265
266
0
    size_t safelen = (safe - out);
267
0
    len = MIN(len, safelen);
268
269
#ifndef HAVE_MASKED_READWRITE
270
0
    size_t from_dist = (size_t)ABS(safe - from);
271
0
    if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
272
0
        while (len--) {
273
0
            *out++ = *from++;
274
0
        }
275
276
0
        return out;
277
0
    }
278
0
#endif
279
280
0
    return CHUNKMEMSET(out, from, len);
281
0
}
Unexecuted instantiation: chunkset_sse2.c:CHUNKCOPY_SAFE
Unexecuted instantiation: chunkset_ssse3.c:CHUNKCOPY_SAFE
Unexecuted instantiation: chunkset_avx2.c:CHUNKCOPY_SAFE
Unexecuted instantiation: chunkset_avx512.c:CHUNKCOPY_SAFE