Coverage Report

Created: 2026-05-28 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/zlib-ng/functable.c
Line
Count
Source
1
/* functable.c -- Choose relevant optimized functions at runtime
2
 * Copyright (C) 2017 Hans Kristian Rosbach
3
 * For conditions of distribution and use, see copyright notice in zlib.h
4
 */
5
#ifndef DISABLE_RUNTIME_CPU_DETECTION
6
7
#include "zbuild.h"
8
9
#if defined(_MSC_VER)
10
#  include <intrin.h>
11
#endif
12
13
#include "functable.h"
14
#include "cpu_features.h"
15
#include "arch_functions.h"
16
17
/* Primitives used to publish function pointers into the global `functable`.
 *
 * FUNCTABLE_ASSIGN(VAR, FUNC_NAME) stores VAR.FUNC_NAME into functable.FUNC_NAME.
 * FUNCTABLE_BARRIER() is a memory barrier to be issued after the stores.
 *
 * The implementation is chosen by compiler:
 *   - GCC/Clang: __atomic_store and __atomic_thread_fence, both __ATOMIC_SEQ_CST.
 *   - MSVC: _InterlockedExchangePointer for the store; the barrier is
 *     _ReadWriteBarrier(), with an extra __dmb(0xB) when ARCH_ARM is defined.
 *   - Anything else: a plain store through a volatile pointer and an empty
 *     barrier, accompanied by a compile-time warning.
 */
18
#if defined(__GNUC__) || defined(__clang__)
19
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
20
22
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
21
2
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
22
#elif defined(_MSC_VER)
23
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
24
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
25
#  ifdef ARCH_ARM
26
#    define FUNCTABLE_BARRIER() do { \
27
    _ReadWriteBarrier();  \
28
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
29
    _ReadWriteBarrier(); \
30
} while (0)
31
#  else
32
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
33
#  endif
34
#else
35
#  warning Unable to detect atomic intrinsic support.
36
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
37
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
38
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
39
#endif
40
41
/* Verify that VAR.FUNC_NAME is non-NULL, then publish it with FUNCTABLE_ASSIGN.
 * If the pointer is NULL, a message is written to stderr and the ENCLOSING
 * function returns 1 (the macro contains a `return`).
 * This allows inflateinit/deflateinit functions to gracefully return Z_VERSION_ERROR
 * if functable initialization fails.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement and
 * is safe inside an unbraced if/else; invoke it with a trailing semicolon.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    do { \
        if (!(VAR).FUNC_NAME) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            return 1; \
        } \
        FUNCTABLE_ASSIGN(VAR, FUNC_NAME); \
    } while (0)
51
52
/* Run init_functable() and abort the process if it reports failure.
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateinit/deflateinit functions, and have no error propagation.
 *
 * Wrapped in do { } while (0) without a trailing semicolon, so that
 * `FUNCTABLE_INIT_ABORT;` is a single statement and composes with if/else.
 */
#define FUNCTABLE_INIT_ABORT \
    do { \
        if (init_functable()) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            abort(); \
        } \
    } while (0)
61
62
/* No-op force_init handler: reports success without doing any work.
 * Used when functable has already been initialized. */
static int force_init_empty(void)
{
    return 0;
}
66
67
/* Functable initialization.
68
 * Selects the best available optimized functions appropriate for the runtime cpu.
69
 */
70
2
static int init_functable(void) {
71
2
    struct functable_s ft;
72
2
    struct cpu_features cf;
73
74
2
    memset(&ft, 0, sizeof(struct functable_s));
75
2
    cpu_check_features(&cf);
76
2
    ft.force_init = &force_init_empty;
77
78
    // Only use necessary generic functions when no suitable simd versions are available.
79
2
#ifdef ADLER32_FALLBACK
80
2
    ft.adler32 = &adler32_c;
81
2
    ft.adler32_copy = &adler32_copy_c;
82
2
#endif
83
#ifdef CHUNKSET_FALLBACK
84
    ft.chunkmemset_safe = &chunkmemset_safe_c;
85
    ft.inflate_fast = &inflate_fast_c;
86
#endif
87
#ifdef COMPARE256_FALLBACK
88
    ft.compare256 = &compare256_c;
89
    ft.longest_match = &longest_match_c;
90
    ft.longest_match_roll = &longest_match_roll_c;
91
#endif
92
2
#ifdef CRC32_BRAID_FALLBACK
93
2
    ft.crc32 = &crc32_braid;
94
2
    ft.crc32_copy = &crc32_copy_braid;
95
2
#endif
96
#ifdef SLIDE_HASH_FALLBACK
97
    ft.slide_hash = &slide_hash_c;
98
#endif
99
100
    // Select arch-optimized functions
101
2
#ifdef WITH_OPTIM
102
103
    // Chorba generic C fallback
104
2
#ifdef CRC32_CHORBA_FALLBACK
105
2
    ft.crc32 = &crc32_chorba;
106
2
    ft.crc32_copy = &crc32_copy_chorba;
107
2
#endif
108
109
    // X86 - SSE2
110
2
#ifdef X86_SSE2
111
#  ifndef X86_SSE2_NATIVE
112
    if (cf.x86.has_sse2)
113
#  endif
114
2
    {
115
2
#  ifndef X86_AVX2_NATIVE
116
2
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
117
2
        ft.compare256 = &compare256_sse2;
118
2
        ft.inflate_fast = &inflate_fast_sse2;
119
2
        ft.longest_match = &longest_match_sse2;
120
2
        ft.longest_match_roll = &longest_match_roll_sse2;
121
2
        ft.slide_hash = &slide_hash_sse2;
122
2
#  endif
123
2
#  if defined(CRC32_CHORBA_SSE_FALLBACK) && !defined(X86_SSE41_NATIVE) && !defined(X86_PCLMULQDQ_NATIVE)
124
2
        ft.crc32 = &crc32_chorba_sse2;
125
2
        ft.crc32_copy = &crc32_copy_chorba_sse2;
126
2
#  endif
127
2
    }
128
2
#endif
129
    // X86 - SSSE3
130
2
#ifdef X86_SSSE3
131
2
#  ifndef X86_SSSE3_NATIVE
132
2
    if (cf.x86.has_ssse3)
133
2
#  endif
134
2
    {
135
2
        ft.adler32 = &adler32_ssse3;
136
2
        ft.adler32_copy = &adler32_copy_ssse3;
137
2
#  ifndef X86_AVX2_NATIVE
138
2
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
139
2
        ft.inflate_fast = &inflate_fast_ssse3;
140
2
#  endif
141
2
    }
142
2
#endif
143
144
    // X86 - SSE4.1
145
2
#if defined(X86_SSE41) && !defined(X86_PCLMULQDQ_NATIVE)
146
2
#  ifndef X86_SSE41_NATIVE
147
2
    if (cf.x86.has_sse41)
148
2
#  endif
149
2
    {
150
2
#  ifdef CRC32_CHORBA_SSE_FALLBACK
151
2
        ft.crc32 = &crc32_chorba_sse41;
152
2
        ft.crc32_copy = &crc32_copy_chorba_sse41;
153
2
#  endif
154
2
    }
155
2
#endif
156
157
    // X86 - SSE4.2
158
2
#if defined(X86_SSE42) && !defined(X86_AVX512_NATIVE)
159
2
#  ifndef X86_SSE42_NATIVE
160
2
    if (cf.x86.has_sse42)
161
2
#  endif
162
2
    {
163
2
        ft.adler32_copy = &adler32_copy_sse42;
164
2
    }
165
2
#endif
166
    // X86 - PCLMUL
167
2
#if defined(X86_PCLMULQDQ_CRC) && !defined(X86_VPCLMULQDQ_NATIVE)
168
2
#  ifndef X86_PCLMULQDQ_NATIVE
169
2
    if (cf.x86.has_pclmulqdq)
170
2
#  endif
171
2
    {
172
2
        ft.crc32 = &crc32_pclmulqdq;
173
2
        ft.crc32_copy = &crc32_copy_pclmulqdq;
174
2
    }
175
2
#endif
176
    // X86 - AVX2
177
2
#ifdef X86_AVX2
178
    /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
179
     * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
180
     * for the shift results as an operand, eliminating several register-register moves when the original value needs
181
     * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
182
2
#  ifndef X86_AVX2_NATIVE
183
2
    if (cf.x86.has_avx2 && cf.x86.has_bmi2)
184
2
#  endif
185
2
    {
186
2
#  ifndef X86_AVX512_NATIVE
187
2
        ft.adler32 = &adler32_avx2;
188
2
        ft.adler32_copy = &adler32_copy_avx2;
189
2
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
190
2
        ft.compare256 = &compare256_avx2;
191
2
        ft.inflate_fast = &inflate_fast_avx2;
192
2
        ft.longest_match = &longest_match_avx2;
193
2
        ft.longest_match_roll = &longest_match_roll_avx2;
194
2
#  endif
195
2
        ft.slide_hash = &slide_hash_avx2;
196
2
    }
197
2
#endif
198
    // X86 - AVX512 (F,DQ,BW,Vl)
199
2
#ifdef X86_AVX512
200
2
#  ifndef X86_AVX512_NATIVE
201
2
    if (cf.x86.has_avx512_common)
202
0
#  endif
203
0
    {
204
0
#  ifndef X86_AVX512VNNI_NATIVE
205
0
        ft.adler32 = &adler32_avx512;
206
0
        ft.adler32_copy = &adler32_copy_avx512;
207
0
#  endif
208
0
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
209
0
        ft.compare256 = &compare256_avx512;
210
0
        ft.inflate_fast = &inflate_fast_avx512;
211
0
        ft.longest_match = &longest_match_avx512;
212
0
        ft.longest_match_roll = &longest_match_roll_avx512;
213
0
    }
214
2
#endif
215
2
#ifdef X86_AVX512VNNI
216
2
#  ifndef X86_AVX512VNNI_NATIVE
217
2
    if (cf.x86.has_avx512vnni)
218
0
#  endif
219
0
    {
220
0
        ft.adler32 = &adler32_avx512_vnni;
221
0
        ft.adler32_copy = &adler32_copy_avx512_vnni;
222
0
    }
223
2
#endif
224
    // X86 - VPCLMULQDQ (AVX2)
225
2
#ifdef X86_VPCLMULQDQ_AVX2
226
2
#  ifndef X86_VPCLMULQDQ_AVX2_NATIVE
227
2
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx2 && cf.x86.has_vpclmulqdq)
228
0
#  endif
229
0
    {
230
0
        ft.crc32 = &crc32_vpclmulqdq_avx2;
231
0
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx2;
232
0
    }
233
2
#endif
234
    // X86 - VPCLMULQDQ (AVX-512)
235
2
#ifdef X86_VPCLMULQDQ_AVX512
236
2
#  ifndef X86_VPCLMULQDQ_AVX512_NATIVE
237
2
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq)
238
0
#  endif
239
0
    {
240
0
        ft.crc32 = &crc32_vpclmulqdq_avx512;
241
0
        ft.crc32_copy = &crc32_copy_vpclmulqdq_avx512;
242
0
    }
243
2
#endif
244
245
246
    // ARM - SIMD
247
#if defined(ARM_SIMD) && !defined(ARM_NEON_NATIVE)
248
#  ifndef ARM_SIMD_NATIVE
249
    if (cf.arm.has_simd)
250
#  endif
251
    {
252
        ft.slide_hash = &slide_hash_armv6;
253
    }
254
#endif
255
    // ARM - NEON
256
#ifdef ARM_NEON
257
#  ifndef ARM_NEON_NATIVE
258
    if (cf.arm.has_neon)
259
#  endif
260
    {
261
        ft.adler32 = &adler32_neon;
262
        ft.adler32_copy = &adler32_copy_neon;
263
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
264
        ft.compare256 = &compare256_neon;
265
        ft.inflate_fast = &inflate_fast_neon;
266
        ft.longest_match = &longest_match_neon;
267
        ft.longest_match_roll = &longest_match_roll_neon;
268
        ft.slide_hash = &slide_hash_neon;
269
    }
270
#endif
271
    // ARM - CRC32
272
#if defined(ARM_CRC32) && !defined(ARM_PMULL_EOR3_NATIVE)
273
#  ifndef ARM_CRC32_NATIVE
274
    if (cf.arm.has_crc32)
275
#  endif
276
    {
277
        ft.crc32 = &crc32_armv8;
278
        ft.crc32_copy = &crc32_copy_armv8;
279
    }
280
#endif
281
    // ARM - PMULL EOR3
282
#ifdef ARM_PMULL_EOR3
283
#  ifndef ARM_PMULL_EOR3_NATIVE
284
    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull)
285
#  endif
286
    {
287
        ft.crc32 = &crc32_armv8_pmull_eor3;
288
        ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
289
    }
290
#endif
291
292
    // Power - VMX
293
#ifdef PPC_VMX
294
#  ifndef PPC_VMX_NATIVE
295
    if (cf.power.has_altivec)
296
#  endif
297
    {
298
        ft.adler32 = &adler32_vmx;
299
        ft.adler32_copy = &adler32_copy_vmx;
300
        ft.slide_hash = &slide_hash_vmx;
301
    }
302
#endif
303
    // Power8 - VSX
304
#ifdef POWER8_VSX
305
#  ifndef POWER8_VSX_NATIVE
306
    if (cf.power.has_arch_2_07)
307
#  endif
308
    {
309
        ft.adler32 = &adler32_power8;
310
        ft.adler32_copy = &adler32_copy_power8;
311
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
312
        ft.inflate_fast = &inflate_fast_power8;
313
        ft.slide_hash = &slide_hash_power8;
314
    }
315
#endif
316
#ifdef POWER8_VSX_CRC32
317
#  ifndef POWER8_VSX_CRC32_NATIVE
318
    if (cf.power.has_arch_2_07)
319
#  endif
320
    {
321
        ft.crc32 = &crc32_power8;
322
        ft.crc32_copy = &crc32_copy_power8;
323
    }
324
#endif
325
    // Power9
326
#ifdef POWER9
327
#  ifndef POWER9_NATIVE
328
    if (cf.power.has_arch_3_00)
329
#  endif
330
    {
331
        ft.compare256 = &compare256_power9;
332
        ft.longest_match = &longest_match_power9;
333
        ft.longest_match_roll = &longest_match_roll_power9;
334
    }
335
#endif
336
337
338
    // RISCV - RVV
339
#ifdef RISCV_RVV
340
#  ifndef RISCV_RVV_NATIVE
341
    if (cf.riscv.has_rvv)
342
#  endif
343
    {
344
        ft.adler32 = &adler32_rvv;
345
        ft.adler32_copy = &adler32_copy_rvv;
346
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
347
        ft.compare256 = &compare256_rvv;
348
        ft.inflate_fast = &inflate_fast_rvv;
349
        ft.longest_match = &longest_match_rvv;
350
        ft.longest_match_roll = &longest_match_roll_rvv;
351
        ft.slide_hash = &slide_hash_rvv;
352
    }
353
#endif
354
355
    // RISCV - ZBC
356
#ifdef RISCV_CRC32_ZBC
357
#  ifndef RISCV_ZBC_NATIVE
358
    if (cf.riscv.has_zbc)
359
#  endif
360
    {
361
        ft.crc32 = &crc32_riscv64_zbc;
362
        ft.crc32_copy = &crc32_copy_riscv64_zbc;
363
    }
364
#endif
365
366
    // S390
367
#ifdef S390_VX
368
#  ifndef S390_VX_NATIVE
369
    if (cf.s390.has_vx)
370
#  endif
371
    {
372
        ft.crc32 = &crc32_s390_vx;
373
        ft.crc32_copy = &crc32_copy_s390_vx;
374
        ft.slide_hash = &slide_hash_vx;
375
    }
376
#endif
377
378
    // LOONGARCH
379
#ifdef LOONGARCH_CRC
380
#  ifndef LOONGARCH_CRC_NATIVE
381
    if (cf.loongarch.has_crc)
382
#  endif
383
    {
384
        ft.crc32 = &crc32_loongarch64;
385
        ft.crc32_copy = &crc32_copy_loongarch64;
386
    }
387
#endif
388
#if defined(LOONGARCH_LSX) && !defined(LOONGARCH_LASX_NATIVE)
389
#  ifndef LOONGARCH_LSX_NATIVE
390
    if (cf.loongarch.has_lsx)
391
#  endif
392
    {
393
        ft.adler32 = &adler32_lsx;
394
        ft.adler32_copy = &adler32_copy_lsx;
395
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
396
        ft.compare256 = &compare256_lsx;
397
        ft.inflate_fast = &inflate_fast_lsx;
398
        ft.longest_match = &longest_match_lsx;
399
        ft.longest_match_roll = &longest_match_roll_lsx;
400
        ft.slide_hash = &slide_hash_lsx;
401
    }
402
#endif
403
#ifdef LOONGARCH_LASX
404
#  ifndef LOONGARCH_LASX_NATIVE
405
    if (cf.loongarch.has_lasx)
406
#  endif
407
    {
408
        ft.adler32 = &adler32_lasx;
409
        ft.adler32_copy = &adler32_copy_lasx;
410
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
411
        ft.compare256 = &compare256_lasx;
412
        ft.inflate_fast = &inflate_fast_lasx;
413
        ft.longest_match = &longest_match_lasx;
414
        ft.longest_match_roll = &longest_match_roll_lasx;
415
        ft.slide_hash = &slide_hash_lasx;
416
    }
417
#endif
418
419
2
#endif // WITH_OPTIM
420
421
    // Assign function pointers individually for atomic operation
422
2
    FUNCTABLE_ASSIGN(ft, force_init);
423
2
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
424
2
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy);
425
2
    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
426
2
    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
427
2
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
428
2
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy);
429
2
    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
430
2
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
431
2
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_roll);
432
2
    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);
433
434
    // Memory barrier for weak memory order CPUs
435
2
    FUNCTABLE_BARRIER();
436
437
2
    return Z_OK;
438
2
}
439
440
#if !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__))
441
2
static void __attribute__((constructor)) functable_constructor(void) {
442
2
    FUNCTABLE_INIT_ABORT;
443
2
}
444
#endif
445
446
/* stub functions */
447
0
/* Initial force_init entry of functable: runs init_functable() and hands its
 * status back to the caller unchanged (no abort on failure). */
static int force_init_stub(void) {
    int status = init_functable();

    return status;
}
450
451
0
static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
452
0
    FUNCTABLE_INIT_ABORT;
453
0
    return functable.adler32(adler, buf, len);
454
0
}
455
456
0
static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
457
0
    FUNCTABLE_INIT_ABORT;
458
0
    return functable.adler32_copy(adler, dst, src, len);
459
0
}
460
461
0
static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, size_t len, size_t left) {
462
0
    FUNCTABLE_INIT_ABORT;
463
0
    return functable.chunkmemset_safe(out, from, len, left);
464
0
}
465
466
0
static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
467
0
    FUNCTABLE_INIT_ABORT;
468
0
    return functable.compare256(src0, src1);
469
0
}
470
471
0
static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
472
0
    FUNCTABLE_INIT_ABORT;
473
0
    return functable.crc32(crc, buf, len);
474
0
}
475
476
0
static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
477
0
    FUNCTABLE_INIT_ABORT;
478
0
    return functable.crc32_copy(crc, dst, src, len);
479
0
}
480
481
0
/* Initial inflate_fast entry of functable: initializes the functable (aborting
 * on failure), then forwards the call through functable.inflate_fast. */
static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start, int safe_mode)
{
    FUNCTABLE_INIT_ABORT;

    functable.inflate_fast(strm, start, safe_mode);
}
485
486
0
/* Initial longest_match entry of functable: initializes the functable
 * (aborting on failure), then forwards the call through
 * functable.longest_match. */
static uint32_t longest_match_stub(deflate_state *const s, uint32_t cur_match) {
    uint32_t result;

    FUNCTABLE_INIT_ABORT;
    result = functable.longest_match(s, cur_match);
    return result;
}
490
491
0
/* Initial longest_match_roll entry of functable: initializes the functable
 * (aborting on failure), then forwards the call through
 * functable.longest_match_roll. */
static uint32_t longest_match_roll_stub(deflate_state *const s, uint32_t cur_match) {
    uint32_t result;

    FUNCTABLE_INIT_ABORT;
    result = functable.longest_match_roll(s, cur_match);
    return result;
}
495
496
0
/* Initial slide_hash entry of functable: initializes the functable (aborting
 * on failure), then forwards the call through functable.slide_hash. */
static void slide_hash_stub(deflate_state *s)
{
    FUNCTABLE_INIT_ABORT;

    functable.slide_hash(s);
}
500
501
/* functable init */
502
Z_INTERNAL struct functable_s functable = {
503
    force_init_stub,
504
    adler32_stub,
505
    adler32_copy_stub,
506
    chunkmemset_safe_stub,
507
    compare256_stub,
508
    crc32_stub,
509
    crc32_copy_stub,
510
    inflate_fast_stub,
511
    longest_match_stub,
512
    longest_match_roll_stub,
513
    slide_hash_stub,
514
};
515
516
#endif