Coverage Report

Created: 2025-10-23 06:32

/src/zlib-ng/functable.c
Line| Count|Source
   1|      |/* functable.c -- Choose relevant optimized functions at runtime
   2|      | * Copyright (C) 2017 Hans Kristian Rosbach
   3|      | * For conditions of distribution and use, see copyright notice in zlib.h
   4|      | */
   5|      |#ifndef DISABLE_RUNTIME_CPU_DETECTION
   6|      |
   7|      |#include "zbuild.h"
   8|      |
   9|      |#if defined(_MSC_VER)
  10|      |#  include <intrin.h>
  11|      |#endif
  12|      |
  13|      |#include "functable.h"
  14|      |#include "cpu_features.h"
  15|      |#include "arch_functions.h"
  16|      |
  17|      |/* Platform has pointer size atomic store */
  18|      |#if defined(__GNUC__) || defined(__clang__)
  19|      |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
  20|    14|    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
  21|     1|#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
  22|      |#elif defined(_MSC_VER)
  23|      |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
  24|      |    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
  25|      |#  if defined(_M_ARM) || defined(_M_ARM64)
  26|      |#    define FUNCTABLE_BARRIER() do { \
  27|      |    _ReadWriteBarrier();  \
  28|      |    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
  29|      |    _ReadWriteBarrier(); \
  30|      |} while (0)
  31|      |#  else
  32|      |#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
  33|      |#  endif
  34|      |#else
  35|      |#  warning Unable to detect atomic intrinsic support.
  36|      |#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
  37|      |    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
  38|      |#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
  39|      |#endif
  40|      |
  41|      |/* Verify all pointers are valid before assigning, return 1 on failure
  42|      | * This allows inflateinit/deflateinit functions to gracefully return Z_VERSION_ERROR
  43|      | * if functable initialization fails.
  44|      | */
  45|      |#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
  46|    13|    if (!VAR.FUNC_NAME) { \
  47|     0|        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
  48|     0|        return 1; \
  49|     0|    } \
  50|    13|    FUNCTABLE_ASSIGN(VAR, FUNC_NAME);
  51|      |
  52|      |/* Functable init & abort on failure.
  53|      | * Abort is needed because some stub functions are reachable without first
  54|      | * calling any inflateinit/deflateinit functions, and have no error propagation.
  55|      | */
  56|      |#define FUNCTABLE_INIT_ABORT \
  57|     0|    if (init_functable()) { \
  58|     0|        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
  59|     0|        abort(); \
  60|     0|    };
  61|      |
  62|      |// Empty stub, used when functable has already been initialized
  63| 5.97k|static int force_init_empty(void) {
  64| 5.97k|    return 0;
  65| 5.97k|}
  66|      |
  67|      |/* Functable initialization.
  68|      | * Selects the best available optimized functions appropriate for the runtime cpu.
  69|      | */
  70|     1|static int init_functable(void) {
  71|     1|    struct functable_s ft;
  72|     1|    struct cpu_features cf;
  73|      |
  74|     1|    cpu_check_features(&cf);
  75|     1|    ft.force_init = &force_init_empty;
  76|      |
  77|      |    // Set up generic C code fallbacks
  78|     1|#ifndef WITH_ALL_FALLBACKS
  79|     1|#  if (defined(__x86_64__) || defined(_M_X64)) && defined(X86_SSE2)
  80|      |    // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available.
  81|     1|    ft.adler32 = &adler32_c;
  82|     1|    ft.adler32_fold_copy = &adler32_fold_copy_c;
  83|     1|    ft.crc32 = &crc32_c;
  84|     1|    ft.crc32_fold = &crc32_fold_c;
  85|     1|    ft.crc32_fold_copy = &crc32_fold_copy_c;
  86|     1|    ft.crc32_fold_final = &crc32_fold_final_c;
  87|     1|    ft.crc32_fold_reset = &crc32_fold_reset_c;
  88|      |#    ifndef HAVE_BUILTIN_CTZ
  89|      |    ft.longest_match = &longest_match_c;
  90|      |    ft.longest_match_slow = &longest_match_slow_c;
  91|      |    ft.compare256 = &compare256_c;
  92|      |#    endif
  93|     1|#  endif
  94|      |#else // WITH_ALL_FALLBACKS
  95|      |    ft.adler32 = &adler32_c;
  96|      |    ft.adler32_fold_copy = &adler32_fold_copy_c;
  97|      |    ft.chunkmemset_safe = &chunkmemset_safe_c;
  98|      |    ft.crc32 = &crc32_c;
  99|      |    ft.crc32_fold = &crc32_fold_c;
 100|      |    ft.crc32_fold_copy = &crc32_fold_copy_c;
 101|      |    ft.crc32_fold_final = &crc32_fold_final_c;
 102|      |    ft.crc32_fold_reset = &crc32_fold_reset_c;
 103|      |    ft.inflate_fast = &inflate_fast_c;
 104|      |    ft.slide_hash = &slide_hash_c;
 105|      |    ft.longest_match = &longest_match_c;
 106|      |    ft.longest_match_slow = &longest_match_slow_c;
 107|      |    ft.compare256 = &compare256_c;
 108|      |#endif
 109|      |
 110|      |    // Select arch-optimized functions
 111|     1|#ifdef WITH_OPTIM
 112|      |
 113|      |    // X86 - SSE2
 114|     1|#ifdef X86_SSE2
 115|      |#  if !defined(__x86_64__) && !defined(_M_X64)
 116|      |    if (cf.x86.has_sse2)
 117|      |#  endif
 118|     1|    {
 119|     1|        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
 120|     1|#  if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
 121|     1|        ft.crc32 = &crc32_chorba_sse2;
 122|     1|#  endif
 123|     1|        ft.inflate_fast = &inflate_fast_sse2;
 124|     1|        ft.slide_hash = &slide_hash_sse2;
 125|     1|#  ifdef HAVE_BUILTIN_CTZ
 126|     1|        ft.compare256 = &compare256_sse2;
 127|     1|        ft.longest_match = &longest_match_sse2;
 128|     1|        ft.longest_match_slow = &longest_match_slow_sse2;
 129|     1|#  endif
 130|     1|    }
 131|     1|#endif
 132|      |    // X86 - SSSE3
 133|     1|#ifdef X86_SSSE3
 134|     1|    if (cf.x86.has_ssse3) {
 135|     1|        ft.adler32 = &adler32_ssse3;
 136|     1|        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
 137|     1|        ft.inflate_fast = &inflate_fast_ssse3;
 138|     1|    }
 139|     1|#endif
 140|      |
 141|      |    // X86 - SSE4.1
 142|     1|#ifdef X86_SSE41
 143|     1|    if (cf.x86.has_sse41) {
 144|     1|#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
 145|     1|        ft.crc32 = &crc32_chorba_sse41;
 146|     1|#endif
 147|     1|    }
 148|     1|#endif
 149|      |
 150|      |    // X86 - SSE4.2
 151|     1|#ifdef X86_SSE42
 152|     1|    if (cf.x86.has_sse42) {
 153|     1|        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
 154|     1|    }
 155|     1|#endif
 156|      |    // X86 - PCLMUL
 157|     1|#ifdef X86_PCLMULQDQ_CRC
 158|     1|    if (cf.x86.has_pclmulqdq) {
 159|     1|        ft.crc32 = &crc32_pclmulqdq;
 160|     1|        ft.crc32_fold = &crc32_fold_pclmulqdq;
 161|     1|        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
 162|     1|        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
 163|     1|        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
 164|     1|    }
 165|     1|#endif
 166|      |    // X86 - AVX
 167|     1|#ifdef X86_AVX2
 168|      |    /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
 169|      |     * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
 170|      |     * for the shift results as an operand, eliminating several register-register moves when the original value needs
 171|      |     * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
 172|     1|    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
 173|     1|        ft.adler32 = &adler32_avx2;
 174|     1|        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
 175|     1|        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
 176|     1|        ft.inflate_fast = &inflate_fast_avx2;
 177|     1|        ft.slide_hash = &slide_hash_avx2;
 178|     1|#  ifdef HAVE_BUILTIN_CTZ
 179|     1|        ft.compare256 = &compare256_avx2;
 180|     1|        ft.longest_match = &longest_match_avx2;
 181|     1|        ft.longest_match_slow = &longest_match_slow_avx2;
 182|     1|#  endif
 183|     1|    }
 184|     1|#endif
 185|      |    // X86 - AVX512 (F,DQ,BW,Vl)
 186|     1|#ifdef X86_AVX512
 187|     1|    if (cf.x86.has_avx512_common) {
 188|     0|        ft.adler32 = &adler32_avx512;
 189|     0|        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
 190|     0|        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
 191|     0|        ft.inflate_fast = &inflate_fast_avx512;
 192|     0|#  ifdef HAVE_BUILTIN_CTZLL
 193|     0|        ft.compare256 = &compare256_avx512;
 194|     0|        ft.longest_match = &longest_match_avx512;
 195|     0|        ft.longest_match_slow = &longest_match_slow_avx512;
 196|     0|#  endif
 197|     0|    }
 198|     1|#endif
 199|     1|#ifdef X86_AVX512VNNI
 200|     1|    if (cf.x86.has_avx512vnni) {
 201|     0|        ft.adler32 = &adler32_avx512_vnni;
 202|     0|        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
 203|     0|    }
 204|     1|#endif
 205|      |    // X86 - VPCLMULQDQ
 206|     1|#ifdef X86_VPCLMULQDQ_CRC
 207|     1|    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
 208|     0|        ft.crc32 = &crc32_vpclmulqdq;
 209|     0|        ft.crc32_fold = &crc32_fold_vpclmulqdq;
 210|     0|        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
 211|     0|        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
 212|     0|        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
 213|     0|    }
 214|     1|#endif
 215|      |
 216|      |
 217|      |    // ARM - SIMD
 218|      |#ifdef ARM_SIMD
 219|      |#  ifndef ARM_NOCHECK_SIMD
 220|      |    if (cf.arm.has_simd)
 221|      |#  endif
 222|      |    {
 223|      |        ft.slide_hash = &slide_hash_armv6;
 224|      |    }
 225|      |#endif
 226|      |    // ARM - NEON
 227|      |#ifdef ARM_NEON
 228|      |#  ifndef ARM_NOCHECK_NEON
 229|      |    if (cf.arm.has_neon)
 230|      |#  endif
 231|      |    {
 232|      |        ft.adler32 = &adler32_neon;
 233|      |        ft.adler32_fold_copy = &adler32_fold_copy_neon;
 234|      |        ft.chunkmemset_safe = &chunkmemset_safe_neon;
 235|      |        ft.inflate_fast = &inflate_fast_neon;
 236|      |        ft.slide_hash = &slide_hash_neon;
 237|      |#  ifdef HAVE_BUILTIN_CTZLL
 238|      |        ft.compare256 = &compare256_neon;
 239|      |        ft.longest_match = &longest_match_neon;
 240|      |        ft.longest_match_slow = &longest_match_slow_neon;
 241|      |#  endif
 242|      |    }
 243|      |#endif
 244|      |    // ARM - CRC32
 245|      |#ifdef ARM_CRC32
 246|      |    if (cf.arm.has_crc32) {
 247|      |        ft.crc32 = &crc32_armv8;
 248|      |        ft.crc32_fold = &crc32_fold_armv8;
 249|      |        ft.crc32_fold_copy = &crc32_fold_copy_armv8;
 250|      |    }
 251|      |#endif
 252|      |
 253|      |
 254|      |    // Power - VMX
 255|      |#ifdef PPC_VMX
 256|      |    if (cf.power.has_altivec) {
 257|      |        ft.adler32 = &adler32_vmx;
 258|      |        ft.slide_hash = &slide_hash_vmx;
 259|      |    }
 260|      |#endif
 261|      |    // Power8 - VSX
 262|      |#ifdef POWER8_VSX
 263|      |    if (cf.power.has_arch_2_07) {
 264|      |        ft.adler32 = &adler32_power8;
 265|      |        ft.chunkmemset_safe = &chunkmemset_safe_power8;
 266|      |        ft.inflate_fast = &inflate_fast_power8;
 267|      |        ft.slide_hash = &slide_hash_power8;
 268|      |    }
 269|      |#endif
 270|      |#ifdef POWER8_VSX_CRC32
 271|      |    if (cf.power.has_arch_2_07)
 272|      |        ft.crc32 = &crc32_power8;
 273|      |#endif
 274|      |    // Power9
 275|      |#ifdef POWER9
 276|      |    if (cf.power.has_arch_3_00) {
 277|      |        ft.compare256 = &compare256_power9;
 278|      |        ft.longest_match = &longest_match_power9;
 279|      |        ft.longest_match_slow = &longest_match_slow_power9;
 280|      |    }
 281|      |#endif
 282|      |
 283|      |
 284|      |    // RISCV - RVV
 285|      |#ifdef RISCV_RVV
 286|      |    if (cf.riscv.has_rvv) {
 287|      |        ft.adler32 = &adler32_rvv;
 288|      |        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
 289|      |        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
 290|      |        ft.compare256 = &compare256_rvv;
 291|      |        ft.inflate_fast = &inflate_fast_rvv;
 292|      |        ft.longest_match = &longest_match_rvv;
 293|      |        ft.longest_match_slow = &longest_match_slow_rvv;
 294|      |        ft.slide_hash = &slide_hash_rvv;
 295|      |    }
 296|      |#endif
 297|      |
 298|      |    // RISCV - ZBC
 299|      |#ifdef RISCV_CRC32_ZBC
 300|      |    if (cf.riscv.has_zbc) {
 301|      |        ft.crc32 = &crc32_riscv64_zbc;
 302|      |    }
 303|      |#endif
 304|      |
 305|      |    // S390
 306|      |#ifdef S390_CRC32_VX
 307|      |    if (cf.s390.has_vx)
 308|      |        ft.crc32 = crc32_s390_vx;
 309|      |#endif
 310|      |
 311|      |    // LOONGARCH
 312|      |#ifdef LOONGARCH_CRC
 313|      |    if (cf.loongarch.has_crc) {
 314|      |        ft.crc32 = crc32_loongarch64;
 315|      |        ft.crc32_fold = &crc32_fold_loongarch64;
 316|      |        ft.crc32_fold_copy = &crc32_fold_copy_loongarch64;
 317|      |    }
 318|      |#endif
 319|      |#ifdef LOONGARCH_LSX
 320|      |    if (cf.loongarch.has_lsx) {
 321|      |        ft.adler32 = &adler32_lsx;
 322|      |        ft.adler32_fold_copy = &adler32_fold_copy_lsx;
 323|      |        ft.slide_hash = slide_hash_lsx;
 324|      |#  ifdef HAVE_BUILTIN_CTZ
 325|      |        ft.compare256 = &compare256_lsx;
 326|      |        ft.longest_match = &longest_match_lsx;
 327|      |        ft.longest_match_slow = &longest_match_slow_lsx;
 328|      |#  endif
 329|      |        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
 330|      |        ft.inflate_fast = &inflate_fast_lsx;
 331|      |    }
 332|      |#endif
 333|      |#ifdef LOONGARCH_LASX
 334|      |    if (cf.loongarch.has_lasx) {
 335|      |        ft.adler32 = &adler32_lasx;
 336|      |        ft.adler32_fold_copy = &adler32_fold_copy_lasx;
 337|      |        ft.slide_hash = slide_hash_lasx;
 338|      |#  ifdef HAVE_BUILTIN_CTZ
 339|      |        ft.compare256 = &compare256_lasx;
 340|      |        ft.longest_match = &longest_match_lasx;
 341|      |        ft.longest_match_slow = &longest_match_slow_lasx;
 342|      |#  endif
 343|      |        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
 344|      |        ft.inflate_fast = &inflate_fast_lasx;
 345|      |    }
 346|      |#endif
 347|      |
 348|     1|#endif // WITH_OPTIM
 349|      |
 350|      |    // Assign function pointers individually for atomic operation
 351|     1|    FUNCTABLE_ASSIGN(ft, force_init);
 352|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
 353|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_fold_copy);
 354|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
 355|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
 356|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
 357|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_fold);
 358|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_fold_copy);
 359|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_fold_final);
 360|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_fold_reset);
 361|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
 362|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
 363|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow);
 364|     1|    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);
 365|      |
 366|      |    // Memory barrier for weak memory order CPUs
 367|     1|    FUNCTABLE_BARRIER();
 368|      |
 369|     1|    return Z_OK;
 370|     1|}
 371|      |
 372|      |/* stub functions */
 373|     1|static int force_init_stub(void) {
 374|     1|    return init_functable();
 375|     1|}
 376|      |
 377|     0|static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
 378|     0|    FUNCTABLE_INIT_ABORT;
 379|     0|    return functable.adler32(adler, buf, len);
 380|     0|}
 381|      |
 382|     0|static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
 383|     0|    FUNCTABLE_INIT_ABORT;
 384|     0|    return functable.adler32_fold_copy(adler, dst, src, len);
 385|     0|}
 386|      |
 387|     0|static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
 388|     0|    FUNCTABLE_INIT_ABORT;
 389|     0|    return functable.chunkmemset_safe(out, from, len, left);
 390|     0|}
 391|      |
 392|     0|static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
 393|     0|    FUNCTABLE_INIT_ABORT;
 394|     0|    return functable.compare256(src0, src1);
 395|     0|}
 396|      |
 397|     0|static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
 398|     0|    FUNCTABLE_INIT_ABORT;
 399|     0|    return functable.crc32(crc, buf, len);
 400|     0|}
 401|      |
 402|     0|static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
 403|     0|    FUNCTABLE_INIT_ABORT;
 404|     0|    functable.crc32_fold(crc, src, len, init_crc);
 405|     0|}
 406|      |
 407|     0|static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
 408|     0|    FUNCTABLE_INIT_ABORT;
 409|     0|    functable.crc32_fold_copy(crc, dst, src, len);
 410|     0|}
 411|      |
 412|     0|static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
 413|     0|    FUNCTABLE_INIT_ABORT;
 414|     0|    return functable.crc32_fold_final(crc);
 415|     0|}
 416|      |
 417|     0|static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
 418|     0|    FUNCTABLE_INIT_ABORT;
 419|     0|    return functable.crc32_fold_reset(crc);
 420|     0|}
 421|      |
 422|     0|static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
 423|     0|    FUNCTABLE_INIT_ABORT;
 424|     0|    functable.inflate_fast(strm, start);
 425|     0|}
 426|      |
 427|     0|static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
 428|     0|    FUNCTABLE_INIT_ABORT;
 429|     0|    return functable.longest_match(s, cur_match);
 430|     0|}
 431|      |
 432|     0|static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
 433|     0|    FUNCTABLE_INIT_ABORT;
 434|     0|    return functable.longest_match_slow(s, cur_match);
 435|     0|}
 436|      |
 437|     0|static void slide_hash_stub(deflate_state* s) {
 438|     0|    FUNCTABLE_INIT_ABORT;
 439|     0|    functable.slide_hash(s);
 440|     0|}
 441|      |
 442|      |/* functable init */
 443|      |Z_INTERNAL struct functable_s functable = {
 444|      |    force_init_stub,
 445|      |    adler32_stub,
 446|      |    adler32_fold_copy_stub,
 447|      |    chunkmemset_safe_stub,
 448|      |    compare256_stub,
 449|      |    crc32_stub,
 450|      |    crc32_fold_stub,
 451|      |    crc32_fold_copy_stub,
 452|      |    crc32_fold_final_stub,
 453|      |    crc32_fold_reset_stub,
 454|      |    inflate_fast_stub,
 455|      |    longest_match_stub,
 456|      |    longest_match_slow_stub,
 457|      |    slide_hash_stub,
 458|      |};
 459|      |
 460|      |#endif
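
The listing above shows zlib-ng's lazy, one-time dispatch-table setup: every functable slot initially points at a *_stub, the first call through a stub runs init_functable(), which picks implementations based on the detected CPU features and publishes each pointer atomically (FUNCTABLE_ASSIGN) followed by FUNCTABLE_BARRIER(), and later calls go straight to the selected implementation. The counts reflect exactly that: init_functable() ran once, while the post-init force_init_empty() stub was hit 5.97k times. Below is a minimal standalone sketch of the same pattern for reference; the names (my_table, sum_bytes_*, cpu_has_simd) are illustrative and not part of zlib-ng, and it assumes GCC or Clang for the __atomic_store builtin.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct my_table_s {
    uint32_t (*sum_bytes)(const uint8_t *buf, size_t len);
};

static uint32_t sum_bytes_stub(const uint8_t *buf, size_t len);

/* The table starts out pointing at the stub (like zlib-ng's functable initializer). */
static struct my_table_s my_table = { sum_bytes_stub };

/* Portable fallback implementation. */
static uint32_t sum_bytes_c(const uint8_t *buf, size_t len) {
    uint32_t s = 0;
    for (size_t i = 0; i < len; i++)
        s += buf[i];
    return s;
}

/* Stand-in for an arch-optimized variant (a real one would use SIMD). */
static uint32_t sum_bytes_fast(const uint8_t *buf, size_t len) {
    return sum_bytes_c(buf, len);
}

/* Stand-in for runtime CPU feature detection (cpu_check_features in the source). */
static int cpu_has_simd(void) {
    return 1;
}

/* One-time init: select an implementation and publish the pointer atomically,
 * mirroring what FUNCTABLE_ASSIGN does with __atomic_store. */
static void init_my_table(void) {
    uint32_t (*impl)(const uint8_t *, size_t) = &sum_bytes_c;
    if (cpu_has_simd())
        impl = &sum_bytes_fast;
    __atomic_store(&my_table.sum_bytes, &impl, __ATOMIC_SEQ_CST);
}

/* Stub installed before init: initialize on first use, then forward through the table. */
static uint32_t sum_bytes_stub(const uint8_t *buf, size_t len) {
    init_my_table();
    return my_table.sum_bytes(buf, len);
}

int main(void) {
    const uint8_t data[] = {1, 2, 3, 4};
    printf("%u\n", my_table.sum_bytes(data, sizeof data)); /* first call goes through the stub */
    printf("%u\n", my_table.sum_bytes(data, sizeof data)); /* later calls hit sum_bytes_fast directly */
    return 0;
}

Built with gcc or clang, this prints the same sum twice; only the first call pays for detection and selection, which is the property the hit counts in the report above make visible.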