Coverage Report

Created: 2025-07-18 06:59

/src/zlib-ng/functable.c
Coverage summary: init_functable() executed once and force_init_empty() was called 2.87k times; the AVX512, AVX512-VNNI, and VPCLMULQDQ branches and every dispatch stub except force_init_stub went unexecuted, and the ARM, Power, RISC-V, S390, and LoongArch sections carry no execution counts in this build.
/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef DISABLE_RUNTIME_CPU_DETECTION

#include "zbuild.h"

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  if defined(_M_ARM) || defined(_M_ARM64)
#    define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
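
/* Each FUNCTABLE_ASSIGN publishes one pointer with a single atomic store
 * (a plain volatile store on the fallback path), so a thread racing with
 * initialization always observes either the stub or the final implementation,
 * never a torn pointer. FUNCTABLE_BARRIER orders those stores on weakly
 * ordered CPUs. */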

static void force_init_empty(void) {
    // empty
}
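
/* Runs once: query CPU features, fill a local table with the best available
 * implementation of each entry point, then publish it atomically. Afterwards
 * force_init points at force_init_empty, so repeated forced initializations
 * (2.87k calls in this run) are no-ops. */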

static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.chunksize = &chunksize_c;
    ft.crc32 = &crc32_c;
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.slide_hash = &slide_hash_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;

    // Select arch-optimized functions

    // X86 - SSE2
#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
#  endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.chunksize = &chunksize_sse2;
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse2;
#endif
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
#  endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
    }
#endif

    // X86 - SSE4.1
#ifdef X86_SSE41
    if (cf.x86.has_sse41) {
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse41;
#endif
    }
#endif

    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX2
#ifdef X86_AVX2
    /* BMI2 support is all but implicit with AVX2, but sanity check it just in case. BMI2 allows
     * flagless shifts, resulting in fewer flag stalls in the pipeline, and lets the shift result
     * be written to a destination operand, eliminating several register-register moves when the
     * original value must remain intact. It also allows a count operand other than the CL
     * register, avoiding contention there. */
    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.chunksize = &chunksize_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
#  endif
    }
#endif
    // X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
    if (cf.x86.has_avx512_common) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.chunksize = &chunksize_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
#  ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
#  endif
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#ifdef X86_VPCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif


    // ARM - SIMD
#ifdef ARM_SIMD
#  ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_fold_copy = &adler32_fold_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.chunksize = &chunksize_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
#  ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
#  endif
    }
#endif
    // ARM - CRC32
#ifdef ARM_CRC32
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_armv8;
    }
#endif


    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.chunksize = &chunksize_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif


    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.chunksize = &chunksize_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
    if (cf.riscv.has_zbc) {
        ft.crc32 = &crc32_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = &crc32_s390_vx;
#endif

    // LOONGARCH
#ifdef LOONGARCH_CRC
    if (cf.loongarch.has_crc) {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_fold = &crc32_fold_loongarch64;
        ft.crc32_fold_copy = &crc32_fold_copy_loongarch64;
    }
#endif
#ifdef LOONGARCH_LSX
    if (cf.loongarch.has_lsx) {
        ft.adler32 = &adler32_lsx;
        ft.adler32_fold_copy = &adler32_fold_copy_lsx;
        ft.slide_hash = &slide_hash_lsx;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
#  endif
        ft.chunksize = &chunksize_lsx;
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
    }
#endif
#ifdef LOONGARCH_LASX
    if (cf.loongarch.has_lasx) {
        ft.adler32 = &adler32_lasx;
        ft.adler32_fold_copy = &adler32_fold_copy_lasx;
        ft.slide_hash = &slide_hash_lasx;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
#  endif
        ft.chunksize = &chunksize_lasx;
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
    }
#endif

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, chunksize);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, slide_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
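/* Each stub initializes the functable on first use, then forwards the call to
 * the implementation that init_functable() just installed. */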
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, from, len, left);
}

static uint32_t chunksize_stub(void) {
    init_functable();
    return functable.chunksize();
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    chunksize_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    longest_match_stub,
    longest_match_slow_stub,
    slide_hash_stub,
};

#endif
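
The table above boots itself lazily: every entry starts out pointing at a stub, the first call through any entry runs init_functable(), and every later call goes straight to the selected implementation. A minimal, self-contained sketch of the same pattern follows; the names dispatch_table, square_c, and square_stub are hypothetical (not zlib-ng symbols), and it omits the atomic publication that functable.c performs.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Table of function pointers, mirroring functable's shape. */
struct dispatch_table_s {
    uint32_t (*square)(uint32_t);
};

static uint32_t square_stub(uint32_t x);

/* Every entry starts at a stub, exactly like functable above. */
static struct dispatch_table_s dispatch_table = { square_stub };

/* The implementation "selected" at runtime. */
static uint32_t square_c(uint32_t x) {
    return x * x;
}

/* One-time init: fill a local table, then publish each pointer.
 * zlib-ng publishes via FUNCTABLE_ASSIGN and finishes with
 * FUNCTABLE_BARRIER; a plain store suffices in this single-threaded sketch. */
static void init_dispatch_table(void) {
    struct dispatch_table_s dt;
    dt.square = &square_c;
    dispatch_table.square = dt.square;
}

/* First call lands here, initializes the table, and re-dispatches. */
static uint32_t square_stub(uint32_t x) {
    init_dispatch_table();
    return dispatch_table.square(x);
}

int main(void) {
    printf("%" PRIu32 "\n", dispatch_table.square(7)); /* via stub: prints 49 */
    printf("%" PRIu32 "\n", dispatch_table.square(8)); /* direct: prints 64 */
    return 0;
}

The coverage numbers above reflect exactly this flow: force_init_stub ran once (triggering the single init_functable() call), while the remaining stubs show zero hits because the table was already initialized by the time those entry points were exercised.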