Coverage Report

Created: 2026-02-14 07:07

/src/zlib-ng/functable.c
  Line|  Count|Source
     1|       |/* functable.c -- Choose relevant optimized functions at runtime
     2|       | * Copyright (C) 2017 Hans Kristian Rosbach
     3|       | * For conditions of distribution and use, see copyright notice in zlib.h
     4|       | */
     5|       |#ifndef DISABLE_RUNTIME_CPU_DETECTION
     6|       |
     7|       |#include "zbuild.h"
     8|       |
     9|       |#if defined(_MSC_VER)
    10|       |#  include <intrin.h>
    11|       |#endif
    12|       |
    13|       |#include "functable.h"
    14|       |#include "cpu_features.h"
    15|       |#include "arch_functions.h"
    16|       |
    17|       |/* Platform has pointer size atomic store */
    18|       |#if defined(__GNUC__) || defined(__clang__)
    19|       |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    20|     11|    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
    21|      1|#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
    22|       |#elif defined(_MSC_VER)
    23|       |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    24|       |    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
    25|       |#  ifdef ARCH_ARM
    26|       |#    define FUNCTABLE_BARRIER() do { \
    27|       |    _ReadWriteBarrier();  \
    28|       |    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    29|       |    _ReadWriteBarrier(); \
    30|       |} while (0)
    31|       |#  else
    32|       |#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
    33|       |#  endif
    34|       |#else
    35|       |#  warning Unable to detect atomic intrinsic support.
    36|       |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    37|       |    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
    38|       |#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
    39|       |#endif
    40|       |
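On GCC/Clang each table slot is published with a sequentially consistent __atomic_store followed by a full fence; on MSVC an interlocked pointer exchange plus _ReadWriteBarrier (and a dmb on ARM) plays the same role. A minimal sketch of this publish pattern outside zlib-ng, using the same GCC/Clang builtins (all names below are hypothetical, not part of functable.c):

    #include <stdint.h>

    typedef uint32_t (*fn_t)(uint32_t);

    static uint32_t impl_generic(uint32_t x) { return x + 1; }  /* placeholder impl */
    static fn_t dispatch;  /* slot read concurrently by other threads */

    static void publish(void) {
        fn_t chosen = &impl_generic;  /* resolver picked an implementation */
        /* FUNCTABLE_ASSIGN analogue: pointer-sized atomic store */
        __atomic_store(&dispatch, &chosen, __ATOMIC_SEQ_CST);
        /* FUNCTABLE_BARRIER analogue: order the stores before later loads */
        __atomic_thread_fence(__ATOMIC_SEQ_CST);
    }

Readers loading dispatch concurrently see either the old pointer or the new one, never a torn value, which is what makes the lazy first-call initialization further down safe.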
    41|       |/* Verify all pointers are valid before assigning, return 1 on failure
    42|       | * This allows inflateInit/deflateInit functions to gracefully return Z_VERSION_ERROR
    43|       | * if functable initialization fails.
    44|       | */
    45|       |#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    46|     10|    if (!VAR.FUNC_NAME) { \
    47|      0|        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
    48|      0|        return 1; \
    49|      0|    } \
    50|     10|    FUNCTABLE_ASSIGN(VAR, FUNC_NAME);
    51|       |
    52|       |/* Functable init & abort on failure.
    53|       | * Abort is needed because some stub functions are reachable without first
    54|       | * calling any inflateInit/deflateInit functions, and have no error propagation.
    55|       | */
    56|       |#define FUNCTABLE_INIT_ABORT \
    57|      0|    if (init_functable()) { \
    58|      0|        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
    59|      0|        abort(); \
    60|      0|    };
    61|       |
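These two macros cover the two ways initialization can be reached: entry points that can report errors (the inflateInit/deflateInit family) run init via force_init and can turn a nonzero return into Z_VERSION_ERROR, while the stubs further down have no error channel and must abort. A hedged sketch of the graceful path (example_init is hypothetical; only functable.force_init and the return convention come from this file):

    /* Hypothetical *Init-style entry point. force_init points at
     * force_init_stub before initialization, so this runs
     * init_functable() on first use and returns nonzero on failure. */
    int example_init(void) {
        if (functable.force_init() != 0)
            return Z_VERSION_ERROR;  /* graceful failure, no abort() */
        return Z_OK;
    }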
    62|       |// Empty stub, used when functable has already been initialized
    63|  8.68k|static int force_init_empty(void) {
    64|  8.68k|    return 0;
    65|  8.68k|}
    66|       |
    67|       |/* Functable initialization.
    68|       | * Selects the best available optimized functions appropriate for the runtime cpu.
    69|       | */
    70|      1|static int init_functable(void) {
    71|      1|    struct functable_s ft;
    72|      1|    struct cpu_features cf;
    73|       |
    74|      1|    memset(&ft, 0, sizeof(struct functable_s));
    75|      1|    cpu_check_features(&cf);
    76|      1|    ft.force_init = &force_init_empty;
    77|       |
    78|       |    // Set up generic C code fallbacks
    79|      1|#ifndef WITH_ALL_FALLBACKS
    80|      1|#  if defined(ARCH_X86) && defined(ARCH_64BIT) && defined(X86_SSE2)
    81|       |    // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available.
    82|      1|    ft.adler32 = &adler32_c;
    83|      1|    ft.adler32_copy = &adler32_copy_c;
    84|      1|    ft.crc32 = &crc32_braid;
    85|      1|    ft.crc32_copy = &crc32_copy_braid;
    86|      1|#  endif
    87|       |#else // WITH_ALL_FALLBACKS
    88|       |    ft.adler32 = &adler32_c;
    89|       |    ft.adler32_copy = &adler32_copy_c;
    90|       |    ft.chunkmemset_safe = &chunkmemset_safe_c;
    91|       |    ft.compare256 = &compare256_c;
    92|       |    ft.crc32 = &crc32_braid;
    93|       |    ft.crc32_copy = &crc32_copy_braid;
    94|       |    ft.inflate_fast = &inflate_fast_c;
    95|       |    ft.longest_match = &longest_match_c;
    96|       |    ft.longest_match_slow = &longest_match_slow_c;
    97|       |    ft.slide_hash = &slide_hash_c;
    98|       |#endif
    99|       |
   100|       |    // Select arch-optimized functions
   101|      1|#ifdef WITH_OPTIM
   102|       |
   103|       |    // Chorba generic C fallback
   104|      1|#ifndef WITHOUT_CHORBA
   105|      1|    ft.crc32 = &crc32_chorba;
   106|      1|    ft.crc32_copy = &crc32_copy_chorba;
   107|      1|#endif
   108|       |
   109|       |    // X86 - SSE2
   110|      1|#ifdef X86_SSE2
   111|       |#  ifdef ARCH_32BIT
   112|       |    if (cf.x86.has_sse2)
   113|       |#  endif
   114|      1|    {
   115|      1|        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
   116|      1|        ft.compare256 = &compare256_sse2;
   117|      1|#  if !defined(WITHOUT_CHORBA_SSE)
   118|      1|        ft.crc32 = &crc32_chorba_sse2;
   119|      1|        ft.crc32_copy = &crc32_copy_chorba_sse2;
   120|      1|#  endif
   121|      1|        ft.inflate_fast = &inflate_fast_sse2;
   122|      1|        ft.longest_match = &longest_match_sse2;
   123|      1|        ft.longest_match_slow = &longest_match_slow_sse2;
   124|      1|        ft.slide_hash = &slide_hash_sse2;
   125|      1|    }
   126|      1|#endif
   127|       |    // X86 - SSSE3
   128|      1|#ifdef X86_SSSE3
   129|      1|    if (cf.x86.has_ssse3) {
   130|      1|        ft.adler32 = &adler32_ssse3;
   131|      1|        ft.adler32_copy = &adler32_copy_ssse3;
   132|      1|        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
   133|      1|        ft.inflate_fast = &inflate_fast_ssse3;
   134|      1|    }
   135|      1|#endif
   136|       |
   137|       |    // X86 - SSE4.1
   138|      1|#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
   139|      1|    if (cf.x86.has_sse41) {
   140|      1|        ft.crc32 = &crc32_chorba_sse41;
   141|      1|        ft.crc32_copy = &crc32_copy_chorba_sse41;
   142|      1|    }
   143|      1|#endif
   144|       |
   145|       |    // X86 - SSE4.2
   146|      1|#ifdef X86_SSE42
   147|      1|    if (cf.x86.has_sse42) {
   148|      1|        ft.adler32_copy = &adler32_copy_sse42;
   149|      1|    }
   150|      1|#endif
   151|       |    // X86 - PCLMUL
   152|      1|#ifdef X86_PCLMULQDQ_CRC
   153|      1|    if (cf.x86.has_pclmulqdq) {
   154|      1|        ft.crc32 = &crc32_pclmulqdq;
   155|      1|        ft.crc32_copy = &crc32_copy_pclmulqdq;
   156|      1|    }
   157|      1|#endif
   158|       |    // X86 - AVX2
   159|      1|#ifdef X86_AVX2
   160|       |    /* BMI2 support is all but implicit with AVX2, but let's sanity check this just in case. Enabling BMI2 allows for
   161|       |     * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
   162|       |     * for the shift results as an operand, eliminating several register-register moves when the original value needs
   163|       |     * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there. */
   164|      1|    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
   165|      1|        ft.adler32 = &adler32_avx2;
   166|      1|        ft.adler32_copy = &adler32_copy_avx2;
   167|      1|        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
   168|      1|        ft.compare256 = &compare256_avx2;
   169|      1|        ft.inflate_fast = &inflate_fast_avx2;
   170|      1|        ft.longest_match = &longest_match_avx2;
   171|      1|        ft.longest_match_slow = &longest_match_slow_avx2;
   172|      1|        ft.slide_hash = &slide_hash_avx2;
   173|      1|    }
   174|      1|#endif
   175|       |    // X86 - AVX512 (F,DQ,BW,VL)
   176|      1|#ifdef X86_AVX512
   177|      1|    if (cf.x86.has_avx512_common) {
   178|      0|        ft.adler32 = &adler32_avx512;
   179|      0|        ft.adler32_copy = &adler32_copy_avx512;
   180|      0|        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
   181|      0|        ft.compare256 = &compare256_avx512;
   182|      0|        ft.inflate_fast = &inflate_fast_avx512;
   183|      0|        ft.longest_match = &longest_match_avx512;
   184|      0|        ft.longest_match_slow = &longest_match_slow_avx512;
   185|      0|    }
   186|      1|#endif
   187|      1|#ifdef X86_AVX512VNNI
   188|      1|    if (cf.x86.has_avx512vnni) {
   189|      0|        ft.adler32 = &adler32_avx512_vnni;
   190|      0|        ft.adler32_copy = &adler32_copy_avx512_vnni;
   191|      0|    }
   192|      1|#endif
   193|       |    // X86 - VPCLMULQDQ
   194|      1|#ifdef X86_VPCLMULQDQ_CRC
   195|      1|    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
   196|      0|        ft.crc32 = &crc32_vpclmulqdq;
   197|      0|        ft.crc32_copy = &crc32_copy_vpclmulqdq;
   198|      0|    }
   199|      1|#endif
   200|       |
   201|       |
   202|       |    // ARM - SIMD
   203|       |#ifdef ARM_SIMD
   204|       |#  ifndef ARM_NOCHECK_SIMD
   205|       |    if (cf.arm.has_simd)
   206|       |#  endif
   207|       |    {
   208|       |        ft.slide_hash = &slide_hash_armv6;
   209|       |    }
   210|       |#endif
   211|       |    // ARM - NEON
   212|       |#ifdef ARM_NEON
   213|       |#  ifndef ARM_NOCHECK_NEON
   214|       |    if (cf.arm.has_neon)
   215|       |#  endif
   216|       |    {
   217|       |        ft.adler32 = &adler32_neon;
   218|       |        ft.adler32_copy = &adler32_copy_neon;
   219|       |        ft.chunkmemset_safe = &chunkmemset_safe_neon;
   220|       |        ft.compare256 = &compare256_neon;
   221|       |        ft.inflate_fast = &inflate_fast_neon;
   222|       |        ft.longest_match = &longest_match_neon;
   223|       |        ft.longest_match_slow = &longest_match_slow_neon;
   224|       |        ft.slide_hash = &slide_hash_neon;
   225|       |    }
   226|       |#endif
   227|       |    // ARM - CRC32
   228|       |#ifdef ARM_CRC32
   229|       |    if (cf.arm.has_crc32) {
   230|       |        ft.crc32 = &crc32_armv8;
   231|       |        ft.crc32_copy = &crc32_copy_armv8;
   232|       |    }
   233|       |#endif
   234|       |    // ARM - PMULL EOR3
   235|       |#ifdef ARM_PMULL_EOR3
   236|       |    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) {
   237|       |        ft.crc32 = &crc32_armv8_pmull_eor3;
   238|       |        ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
   239|       |    }
   240|       |#endif
   241|       |
   242|       |    // Power - VMX
   243|       |#ifdef PPC_VMX
   244|       |    if (cf.power.has_altivec) {
   245|       |        ft.adler32 = &adler32_vmx;
   246|       |        ft.adler32_copy = &adler32_copy_vmx;
   247|       |        ft.slide_hash = &slide_hash_vmx;
   248|       |    }
   249|       |#endif
   250|       |    // Power8 - VSX
   251|       |#ifdef POWER8_VSX
   252|       |    if (cf.power.has_arch_2_07) {
   253|       |        ft.adler32 = &adler32_power8;
   254|       |        ft.adler32_copy = &adler32_copy_power8;
   255|       |        ft.chunkmemset_safe = &chunkmemset_safe_power8;
   256|       |        ft.inflate_fast = &inflate_fast_power8;
   257|       |        ft.slide_hash = &slide_hash_power8;
   258|       |    }
   259|       |#endif
   260|       |#ifdef POWER8_VSX_CRC32
   261|       |    if (cf.power.has_arch_2_07) {
   262|       |        ft.crc32 = &crc32_power8;
   263|       |        ft.crc32_copy = &crc32_copy_power8;
   264|       |    }
   265|       |#endif
   266|       |    // Power9
   267|       |#ifdef POWER9
   268|       |    if (cf.power.has_arch_3_00) {
   269|       |        ft.compare256 = &compare256_power9;
   270|       |        ft.longest_match = &longest_match_power9;
   271|       |        ft.longest_match_slow = &longest_match_slow_power9;
   272|       |    }
   273|       |#endif
   274|       |
   275|       |
   276|       |    // RISCV - RVV
   277|       |#ifdef RISCV_RVV
   278|       |    if (cf.riscv.has_rvv) {
   279|       |        ft.adler32 = &adler32_rvv;
   280|       |        ft.adler32_copy = &adler32_copy_rvv;
   281|       |        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
   282|       |        ft.compare256 = &compare256_rvv;
   283|       |        ft.inflate_fast = &inflate_fast_rvv;
   284|       |        ft.longest_match = &longest_match_rvv;
   285|       |        ft.longest_match_slow = &longest_match_slow_rvv;
   286|       |        ft.slide_hash = &slide_hash_rvv;
   287|       |    }
   288|       |#endif
   289|       |
   290|       |    // RISCV - ZBC
   291|       |#ifdef RISCV_CRC32_ZBC
   292|       |    if (cf.riscv.has_zbc) {
   293|       |        ft.crc32 = &crc32_riscv64_zbc;
   294|       |        ft.crc32_copy = &crc32_copy_riscv64_zbc;
   295|       |    }
   296|       |#endif
   297|       |
   298|       |    // S390
   299|       |#ifdef S390_CRC32_VX
   300|       |    if (cf.s390.has_vx) {
   301|       |        ft.crc32 = crc32_s390_vx;
   302|       |        ft.crc32_copy = crc32_copy_s390_vx;
   303|       |    }
   304|       |#endif
   305|       |
   306|       |    // LOONGARCH
   307|       |#ifdef LOONGARCH_CRC
   308|       |    if (cf.loongarch.has_crc) {
   309|       |        ft.crc32 = crc32_loongarch64;
   310|       |        ft.crc32_copy = crc32_copy_loongarch64;
   311|       |    }
   312|       |#endif
   313|       |#ifdef LOONGARCH_LSX
   314|       |    if (cf.loongarch.has_lsx) {
   315|       |        ft.adler32 = &adler32_lsx;
   316|       |        ft.adler32_copy = &adler32_copy_lsx;
   317|       |        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
   318|       |        ft.compare256 = &compare256_lsx;
   319|       |        ft.inflate_fast = &inflate_fast_lsx;
   320|       |        ft.longest_match = &longest_match_lsx;
   321|       |        ft.longest_match_slow = &longest_match_slow_lsx;
   322|       |        ft.slide_hash = slide_hash_lsx;
   323|       |    }
   324|       |#endif
   325|       |#ifdef LOONGARCH_LASX
   326|       |    if (cf.loongarch.has_lasx) {
   327|       |        ft.adler32 = &adler32_lasx;
   328|       |        ft.adler32_copy = &adler32_copy_lasx;
   329|       |        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
   330|       |        ft.compare256 = &compare256_lasx;
   331|       |        ft.inflate_fast = &inflate_fast_lasx;
   332|       |        ft.longest_match = &longest_match_lasx;
   333|       |        ft.longest_match_slow = &longest_match_slow_lasx;
   334|       |        ft.slide_hash = slide_hash_lasx;
   335|       |    }
   336|       |#endif
   337|       |
   338|      1|#endif // WITH_OPTIM
   339|       |
   340|       |    // Assign function pointers individually for atomic operation
   341|      1|    FUNCTABLE_ASSIGN(ft, force_init);
   342|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
   343|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy);
   344|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
   345|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
   346|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
   347|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy);
   348|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
   349|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
   350|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow);
   351|      1|    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);
   352|       |
   353|       |    // Memory barrier for weak memory order CPUs
   354|      1|    FUNCTABLE_BARRIER();
   355|       |
   356|      1|    return Z_OK;
   357|      1|}
   358|       |
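Everything above is staged in the on-stack ft and only published at the end: each compile-time #ifdef tier overwrites the slots it improves, and each runtime cf check gates that overwrite, so the last tier that passes both filters wins. A condensed sketch of the cascade, with hypothetical tier names standing in for the real ones above:

    /* Hypothetical two-tier cascade; crc32_generic, crc32_tier1, crc32_tier2
     * and the flags are placeholders for the names used above. */
    ft.crc32 = &crc32_generic;              /* always-valid fallback    */
    #ifdef BUILD_TIER1                      /* tier compiled in?        */
        if (cf_has_tier1)                   /* tier present at runtime? */
            ft.crc32 = &crc32_tier1;
    #endif
    #ifdef BUILD_TIER2
        if (cf_has_tier1 && cf_has_tier2)   /* faster tier needs both   */
            ft.crc32 = &crc32_tier2;
    #endif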
   359|       |/* stub functions */
   360|      1|static int force_init_stub(void) {
   361|      1|    return init_functable();
   362|      1|}
   363|       |
   364|      0|static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
   365|      0|    FUNCTABLE_INIT_ABORT;
   366|      0|    return functable.adler32(adler, buf, len);
   367|      0|}
   368|       |
   369|      0|static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
   370|      0|    FUNCTABLE_INIT_ABORT;
   371|      0|    return functable.adler32_copy(adler, dst, src, len);
   372|      0|}
   373|       |
   374|      0|static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
   375|      0|    FUNCTABLE_INIT_ABORT;
   376|      0|    return functable.chunkmemset_safe(out, from, len, left);
   377|      0|}
   378|       |
   379|      0|static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
   380|      0|    FUNCTABLE_INIT_ABORT;
   381|      0|    return functable.compare256(src0, src1);
   382|      0|}
   383|       |
   384|      0|static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
   385|      0|    FUNCTABLE_INIT_ABORT;
   386|      0|    return functable.crc32(crc, buf, len);
   387|      0|}
   388|       |
   389|      0|static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
   390|      0|    FUNCTABLE_INIT_ABORT;
   391|      0|    return functable.crc32_copy(crc, dst, src, len);
   392|      0|}
   393|       |
   394|      0|static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
   395|      0|    FUNCTABLE_INIT_ABORT;
   396|      0|    functable.inflate_fast(strm, start);
   397|      0|}
   398|       |
   399|      0|static uint32_t longest_match_stub(deflate_state* const s, uint32_t cur_match) {
   400|      0|    FUNCTABLE_INIT_ABORT;
   401|      0|    return functable.longest_match(s, cur_match);
   402|      0|}
   403|       |
   404|      0|static uint32_t longest_match_slow_stub(deflate_state* const s, uint32_t cur_match) {
   405|      0|    FUNCTABLE_INIT_ABORT;
   406|      0|    return functable.longest_match_slow(s, cur_match);
   407|      0|}
   408|       |
   409|      0|static void slide_hash_stub(deflate_state* s) {
   410|      0|    FUNCTABLE_INIT_ABORT;
   411|      0|    functable.slide_hash(s);
   412|      0|}
   413|       |
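Every stub has the same shape: run FUNCTABLE_INIT_ABORT, then forward the call through the freshly initialized table. The first call through any slot therefore pays the one-time CPU detection cost, and later calls dispatch directly to the selected implementation. A hedged usage sketch (assumes zlib-ng's internal functable.h; the data and seed values are illustrative only):

    const uint8_t data[4] = {'t', 'e', 's', 't'};
    uint32_t crc;
    crc = functable.crc32(0, data, sizeof(data));   /* hits crc32_stub, runs init  */
    crc = functable.crc32(crc, data, sizeof(data)); /* direct call to chosen impl  */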
   414|       |/* functable init */
   415|       |Z_INTERNAL struct functable_s functable = {
   416|       |    force_init_stub,
   417|       |    adler32_stub,
   418|       |    adler32_copy_stub,
   419|       |    chunkmemset_safe_stub,
   420|       |    compare256_stub,
   421|       |    crc32_stub,
   422|       |    crc32_copy_stub,
   423|       |    inflate_fast_stub,
   424|       |    longest_match_stub,
   425|       |    longest_match_slow_stub,
   426|       |    slide_hash_stub,
   427|       |};
   428|       |
   429|       |#endif
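Because the table starts out pointing at the stubs and every slot is replaced with a sequentially consistent store of a value any racing initializer would compute identically, concurrent first calls appear benign: at worst init_functable() runs more than once with the same result. A hedged sketch of that scenario (assumes zlib-ng internals plus pthreads; worker is hypothetical):

    #include <pthread.h>

    static void *worker(void *arg) {
        const uint8_t buf[1] = {0};
        /* Either thread may be first; both may even run init_functable().
         * The atomic stores publish identical pointers, so no torn state. */
        (void)functable.adler32(1, buf, sizeof(buf));
        return arg;
    }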