Line | Count | Source |
1 | | /* functable.c -- Choose relevant optimized functions at runtime |
2 | | * Copyright (C) 2017 Hans Kristian Rosbach |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | */ |
5 | | #ifndef DISABLE_RUNTIME_CPU_DETECTION |
6 | | |
7 | | #include "zbuild.h" |
8 | | |
9 | | #if defined(_MSC_VER) |
10 | | # include <intrin.h> |
11 | | #endif |
12 | | |
13 | | #include "functable.h" |
14 | | #include "cpu_features.h" |
15 | | #include "arch_functions.h" |
16 | | |
/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
/* Seq-cst atomic store publishes each resolved pointer safely to other threads. */
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
# ifdef ARCH_ARM
/* ARM needs a real data memory barrier; _ReadWriteBarrier only constrains the compiler. */
# define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
# else
# define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif

/* Verify all pointers are valid before assigning, return 1 on failure
 * This allows inflateinit/deflateinit functions to gracefully return Z_VERSION_ERROR
 * if functable initialization fails.
 * Wrapped in do { } while (0) so the multi-statement macro behaves as a single
 * statement and is safe inside unbraced if/else at call sites.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    do { \
        if (!VAR.FUNC_NAME) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            return 1; \
        } \
        FUNCTABLE_ASSIGN(VAR, FUNC_NAME); \
    } while (0)

/* Functable init & abort on failure.
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateinit/deflateinit functions, and have no error propagation.
 * do { } while (0) wrapping makes the macro a single statement (no stray ';').
 */
#define FUNCTABLE_INIT_ABORT \
    do { \
        if (init_functable()) { \
            fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
            abort(); \
        } \
    } while (0)
61 | | |
/* No-op init stub, swapped in once the functable has been fully initialized,
 * so later force_init calls cost only an indirect call that returns 0. */
static int force_init_empty(void) {
    return 0;   /* nothing left to do */
}
66 | | |
67 | | /* Functable initialization. |
68 | | * Selects the best available optimized functions appropriate for the runtime cpu. |
69 | | */ |
70 | 1 | static int init_functable(void) { |
71 | 1 | struct functable_s ft; |
72 | 1 | struct cpu_features cf; |
73 | | |
74 | 1 | memset(&ft, 0, sizeof(struct functable_s)); |
75 | 1 | cpu_check_features(&cf); |
76 | 1 | ft.force_init = &force_init_empty; |
77 | | |
78 | | // Set up generic C code fallbacks |
79 | 1 | #ifndef WITH_ALL_FALLBACKS |
80 | 1 | # if defined(ARCH_X86) && defined(ARCH_64BIT) && defined(X86_SSE2) |
81 | | // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available. |
82 | 1 | ft.adler32 = &adler32_c; |
83 | 1 | ft.adler32_copy = &adler32_copy_c; |
84 | 1 | ft.crc32 = &crc32_braid; |
85 | 1 | ft.crc32_copy = &crc32_copy_braid; |
86 | | # ifndef HAVE_BUILTIN_CTZ |
87 | | ft.longest_match = &longest_match_c; |
88 | | ft.longest_match_slow = &longest_match_slow_c; |
89 | | ft.compare256 = &compare256_c; |
90 | | # endif |
91 | 1 | # endif |
92 | | #else // WITH_ALL_FALLBACKS |
93 | | ft.adler32 = &adler32_c; |
94 | | ft.adler32_copy = &adler32_copy_c; |
95 | | ft.chunkmemset_safe = &chunkmemset_safe_c; |
96 | | ft.crc32 = &crc32_braid; |
97 | | ft.crc32_copy = &crc32_copy_braid; |
98 | | ft.inflate_fast = &inflate_fast_c; |
99 | | ft.slide_hash = &slide_hash_c; |
100 | | ft.longest_match = &longest_match_c; |
101 | | ft.longest_match_slow = &longest_match_slow_c; |
102 | | ft.compare256 = &compare256_c; |
103 | | #endif |
104 | | |
105 | | // Select arch-optimized functions |
106 | 1 | #ifdef WITH_OPTIM |
107 | | |
108 | | // Chorba generic C fallback |
109 | 1 | #ifndef WITHOUT_CHORBA |
110 | 1 | ft.crc32 = &crc32_chorba; |
111 | 1 | ft.crc32_copy = &crc32_copy_chorba; |
112 | 1 | #endif |
113 | | |
114 | | // X86 - SSE2 |
115 | 1 | #ifdef X86_SSE2 |
116 | | # ifdef ARCH_32BIT |
117 | | if (cf.x86.has_sse2) |
118 | | # endif |
119 | 1 | { |
120 | 1 | ft.chunkmemset_safe = &chunkmemset_safe_sse2; |
121 | 1 | # if !defined(WITHOUT_CHORBA_SSE) |
122 | 1 | ft.crc32 = &crc32_chorba_sse2; |
123 | 1 | ft.crc32_copy = &crc32_copy_chorba_sse2; |
124 | 1 | # endif |
125 | 1 | ft.inflate_fast = &inflate_fast_sse2; |
126 | 1 | ft.slide_hash = &slide_hash_sse2; |
127 | 1 | # ifdef HAVE_BUILTIN_CTZ |
128 | 1 | ft.compare256 = &compare256_sse2; |
129 | 1 | ft.longest_match = &longest_match_sse2; |
130 | 1 | ft.longest_match_slow = &longest_match_slow_sse2; |
131 | 1 | # endif |
132 | 1 | } |
133 | 1 | #endif |
134 | | // X86 - SSSE3 |
135 | 1 | #ifdef X86_SSSE3 |
136 | 1 | if (cf.x86.has_ssse3) { |
137 | 1 | ft.adler32 = &adler32_ssse3; |
138 | 1 | ft.adler32_copy = &adler32_copy_ssse3; |
139 | 1 | ft.chunkmemset_safe = &chunkmemset_safe_ssse3; |
140 | 1 | ft.inflate_fast = &inflate_fast_ssse3; |
141 | 1 | } |
142 | 1 | #endif |
143 | | |
144 | | // X86 - SSE4.1 |
145 | 1 | #if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) |
146 | 1 | if (cf.x86.has_sse41) { |
147 | 1 | ft.crc32 = &crc32_chorba_sse41; |
148 | 1 | ft.crc32_copy = &crc32_copy_chorba_sse41; |
149 | 1 | } |
150 | 1 | #endif |
151 | | |
152 | | // X86 - SSE4.2 |
153 | 1 | #ifdef X86_SSE42 |
154 | 1 | if (cf.x86.has_sse42) { |
155 | 1 | ft.adler32_copy = &adler32_copy_sse42; |
156 | 1 | } |
157 | 1 | #endif |
158 | | // X86 - PCLMUL |
159 | 1 | #ifdef X86_PCLMULQDQ_CRC |
160 | 1 | if (cf.x86.has_pclmulqdq) { |
161 | 1 | ft.crc32 = &crc32_pclmulqdq; |
162 | 1 | ft.crc32_copy = &crc32_copy_pclmulqdq; |
163 | 1 | } |
164 | 1 | #endif |
165 | | // X86 - AVX |
166 | 1 | #ifdef X86_AVX2 |
167 | | /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for |
168 | | * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers |
169 | | * for the shift results as an operand, eliminating several register-register moves when the original value needs |
170 | | * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */ |
171 | 1 | if (cf.x86.has_avx2 && cf.x86.has_bmi2) { |
172 | 1 | ft.adler32 = &adler32_avx2; |
173 | 1 | ft.adler32_copy = &adler32_copy_avx2; |
174 | 1 | ft.chunkmemset_safe = &chunkmemset_safe_avx2; |
175 | 1 | ft.inflate_fast = &inflate_fast_avx2; |
176 | 1 | ft.slide_hash = &slide_hash_avx2; |
177 | 1 | # ifdef HAVE_BUILTIN_CTZ |
178 | 1 | ft.compare256 = &compare256_avx2; |
179 | 1 | ft.longest_match = &longest_match_avx2; |
180 | 1 | ft.longest_match_slow = &longest_match_slow_avx2; |
181 | 1 | # endif |
182 | 1 | } |
183 | 1 | #endif |
184 | | // X86 - AVX512 (F,DQ,BW,Vl) |
185 | 1 | #ifdef X86_AVX512 |
186 | 1 | if (cf.x86.has_avx512_common) { |
187 | 0 | ft.adler32 = &adler32_avx512; |
188 | 0 | ft.adler32_copy = &adler32_copy_avx512; |
189 | 0 | ft.chunkmemset_safe = &chunkmemset_safe_avx512; |
190 | 0 | ft.inflate_fast = &inflate_fast_avx512; |
191 | 0 | # ifdef HAVE_BUILTIN_CTZLL |
192 | 0 | ft.compare256 = &compare256_avx512; |
193 | 0 | ft.longest_match = &longest_match_avx512; |
194 | 0 | ft.longest_match_slow = &longest_match_slow_avx512; |
195 | 0 | # endif |
196 | 0 | } |
197 | 1 | #endif |
198 | 1 | #ifdef X86_AVX512VNNI |
199 | 1 | if (cf.x86.has_avx512vnni) { |
200 | 0 | ft.adler32 = &adler32_avx512_vnni; |
201 | 0 | ft.adler32_copy = &adler32_copy_avx512_vnni; |
202 | 0 | } |
203 | 1 | #endif |
204 | | // X86 - VPCLMULQDQ |
205 | 1 | #ifdef X86_VPCLMULQDQ_CRC |
206 | 1 | if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) { |
207 | 0 | ft.crc32 = &crc32_vpclmulqdq; |
208 | 0 | ft.crc32_copy = &crc32_copy_vpclmulqdq; |
209 | 0 | } |
210 | 1 | #endif |
211 | | |
212 | | |
213 | | // ARM - SIMD |
214 | | #ifdef ARM_SIMD |
215 | | # ifndef ARM_NOCHECK_SIMD |
216 | | if (cf.arm.has_simd) |
217 | | # endif |
218 | | { |
219 | | ft.slide_hash = &slide_hash_armv6; |
220 | | } |
221 | | #endif |
222 | | // ARM - NEON |
223 | | #ifdef ARM_NEON |
224 | | # ifndef ARM_NOCHECK_NEON |
225 | | if (cf.arm.has_neon) |
226 | | # endif |
227 | | { |
228 | | ft.adler32 = &adler32_neon; |
229 | | ft.adler32_copy = &adler32_copy_neon; |
230 | | ft.chunkmemset_safe = &chunkmemset_safe_neon; |
231 | | ft.inflate_fast = &inflate_fast_neon; |
232 | | ft.slide_hash = &slide_hash_neon; |
233 | | # ifdef HAVE_BUILTIN_CTZLL |
234 | | ft.compare256 = &compare256_neon; |
235 | | ft.longest_match = &longest_match_neon; |
236 | | ft.longest_match_slow = &longest_match_slow_neon; |
237 | | # endif |
238 | | } |
239 | | #endif |
240 | | // ARM - CRC32 |
241 | | #ifdef ARM_CRC32 |
242 | | if (cf.arm.has_crc32) { |
243 | | ft.crc32 = &crc32_armv8; |
244 | | ft.crc32_copy = &crc32_copy_armv8; |
245 | | } |
246 | | #endif |
247 | | // ARM - PMULL EOR3 |
248 | | #ifdef ARM_PMULL_EOR3 |
249 | | if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) { |
250 | | ft.crc32 = &crc32_armv8_pmull_eor3; |
251 | | ft.crc32_copy = &crc32_copy_armv8_pmull_eor3; |
252 | | } |
253 | | #endif |
254 | | |
255 | | // Power - VMX |
256 | | #ifdef PPC_VMX |
257 | | if (cf.power.has_altivec) { |
258 | | ft.adler32 = &adler32_vmx; |
259 | | ft.adler32_copy = &adler32_copy_vmx; |
260 | | ft.slide_hash = &slide_hash_vmx; |
261 | | } |
262 | | #endif |
263 | | // Power8 - VSX |
264 | | #ifdef POWER8_VSX |
265 | | if (cf.power.has_arch_2_07) { |
266 | | ft.adler32 = &adler32_power8; |
267 | | ft.adler32_copy = &adler32_copy_power8; |
268 | | ft.chunkmemset_safe = &chunkmemset_safe_power8; |
269 | | ft.inflate_fast = &inflate_fast_power8; |
270 | | ft.slide_hash = &slide_hash_power8; |
271 | | } |
272 | | #endif |
273 | | #ifdef POWER8_VSX_CRC32 |
274 | | if (cf.power.has_arch_2_07) { |
275 | | ft.crc32 = &crc32_power8; |
276 | | ft.crc32_copy = &crc32_copy_power8; |
277 | | } |
278 | | #endif |
279 | | // Power9 |
280 | | #ifdef POWER9 |
281 | | if (cf.power.has_arch_3_00) { |
282 | | ft.compare256 = &compare256_power9; |
283 | | ft.longest_match = &longest_match_power9; |
284 | | ft.longest_match_slow = &longest_match_slow_power9; |
285 | | } |
286 | | #endif |
287 | | |
288 | | |
289 | | // RISCV - RVV |
290 | | #ifdef RISCV_RVV |
291 | | if (cf.riscv.has_rvv) { |
292 | | ft.adler32 = &adler32_rvv; |
293 | | ft.adler32_copy = &adler32_copy_rvv; |
294 | | ft.chunkmemset_safe = &chunkmemset_safe_rvv; |
295 | | ft.compare256 = &compare256_rvv; |
296 | | ft.inflate_fast = &inflate_fast_rvv; |
297 | | ft.longest_match = &longest_match_rvv; |
298 | | ft.longest_match_slow = &longest_match_slow_rvv; |
299 | | ft.slide_hash = &slide_hash_rvv; |
300 | | } |
301 | | #endif |
302 | | |
303 | | // RISCV - ZBC |
304 | | #ifdef RISCV_CRC32_ZBC |
305 | | if (cf.riscv.has_zbc) { |
306 | | ft.crc32 = &crc32_riscv64_zbc; |
307 | | ft.crc32_copy = &crc32_copy_riscv64_zbc; |
308 | | } |
309 | | #endif |
310 | | |
311 | | // S390 |
312 | | #ifdef S390_CRC32_VX |
313 | | if (cf.s390.has_vx) { |
314 | | ft.crc32 = crc32_s390_vx; |
315 | | ft.crc32_copy = crc32_copy_s390_vx; |
316 | | } |
317 | | #endif |
318 | | |
319 | | // LOONGARCH |
320 | | #ifdef LOONGARCH_CRC |
321 | | if (cf.loongarch.has_crc) { |
322 | | ft.crc32 = crc32_loongarch64; |
323 | | ft.crc32_copy = crc32_copy_loongarch64; |
324 | | } |
325 | | #endif |
326 | | #ifdef LOONGARCH_LSX |
327 | | if (cf.loongarch.has_lsx) { |
328 | | ft.adler32 = &adler32_lsx; |
329 | | ft.adler32_copy = &adler32_copy_lsx; |
330 | | ft.slide_hash = slide_hash_lsx; |
331 | | # ifdef HAVE_BUILTIN_CTZ |
332 | | ft.compare256 = &compare256_lsx; |
333 | | ft.longest_match = &longest_match_lsx; |
334 | | ft.longest_match_slow = &longest_match_slow_lsx; |
335 | | # endif |
336 | | ft.chunkmemset_safe = &chunkmemset_safe_lsx; |
337 | | ft.inflate_fast = &inflate_fast_lsx; |
338 | | } |
339 | | #endif |
340 | | #ifdef LOONGARCH_LASX |
341 | | if (cf.loongarch.has_lasx) { |
342 | | ft.adler32 = &adler32_lasx; |
343 | | ft.adler32_copy = &adler32_copy_lasx; |
344 | | ft.slide_hash = slide_hash_lasx; |
345 | | # ifdef HAVE_BUILTIN_CTZ |
346 | | ft.compare256 = &compare256_lasx; |
347 | | ft.longest_match = &longest_match_lasx; |
348 | | ft.longest_match_slow = &longest_match_slow_lasx; |
349 | | # endif |
350 | | ft.chunkmemset_safe = &chunkmemset_safe_lasx; |
351 | | ft.inflate_fast = &inflate_fast_lasx; |
352 | | } |
353 | | #endif |
354 | | |
355 | 1 | #endif // WITH_OPTIM |
356 | | |
357 | | // Assign function pointers individually for atomic operation |
358 | 1 | FUNCTABLE_ASSIGN(ft, force_init); |
359 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, adler32); |
360 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy); |
361 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe); |
362 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, compare256); |
363 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, crc32); |
364 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy); |
365 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast); |
366 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, longest_match); |
367 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow); |
368 | 1 | FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash); |
369 | | |
370 | | // Memory barrier for weak memory order CPUs |
371 | 1 | FUNCTABLE_BARRIER(); |
372 | | |
373 | 1 | return Z_OK; |
374 | 1 | } |
375 | | |
/* Initial force_init entry: runs the full one-time functable setup.
 * Replaced by force_init_empty once initialization succeeds. */
static int force_init_stub(void) {
    int rc = init_functable();
    return rc;
}
380 | | |
381 | 0 | static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) { |
382 | 0 | FUNCTABLE_INIT_ABORT; |
383 | 0 | return functable.adler32(adler, buf, len); |
384 | 0 | } |
385 | | |
386 | 0 | static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) { |
387 | 0 | FUNCTABLE_INIT_ABORT; |
388 | 0 | return functable.adler32_copy(adler, dst, src, len); |
389 | 0 | } |
390 | | |
391 | 0 | static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) { |
392 | 0 | FUNCTABLE_INIT_ABORT; |
393 | 0 | return functable.chunkmemset_safe(out, from, len, left); |
394 | 0 | } |
395 | | |
396 | 0 | static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) { |
397 | 0 | FUNCTABLE_INIT_ABORT; |
398 | 0 | return functable.compare256(src0, src1); |
399 | 0 | } |
400 | | |
401 | 0 | static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) { |
402 | 0 | FUNCTABLE_INIT_ABORT; |
403 | 0 | return functable.crc32(crc, buf, len); |
404 | 0 | } |
405 | | |
406 | 0 | static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) { |
407 | 0 | FUNCTABLE_INIT_ABORT; |
408 | 0 | return functable.crc32_copy(crc, dst, src, len); |
409 | 0 | } |
410 | | |
411 | 0 | static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) { |
412 | 0 | FUNCTABLE_INIT_ABORT; |
413 | 0 | functable.inflate_fast(strm, start); |
414 | 0 | } |
415 | | |
416 | 0 | static uint32_t longest_match_stub(deflate_state* const s, uint32_t cur_match) { |
417 | 0 | FUNCTABLE_INIT_ABORT; |
418 | 0 | return functable.longest_match(s, cur_match); |
419 | 0 | } |
420 | | |
421 | 0 | static uint32_t longest_match_slow_stub(deflate_state* const s, uint32_t cur_match) { |
422 | 0 | FUNCTABLE_INIT_ABORT; |
423 | 0 | return functable.longest_match_slow(s, cur_match); |
424 | 0 | } |
425 | | |
426 | 0 | static void slide_hash_stub(deflate_state* s) { |
427 | 0 | FUNCTABLE_INIT_ABORT; |
428 | 0 | functable.slide_hash(s); |
429 | 0 | } |
430 | | |
431 | | /* functable init */ |
432 | | Z_INTERNAL struct functable_s functable = { |
433 | | force_init_stub, |
434 | | adler32_stub, |
435 | | adler32_copy_stub, |
436 | | chunkmemset_safe_stub, |
437 | | compare256_stub, |
438 | | crc32_stub, |
439 | | crc32_copy_stub, |
440 | | inflate_fast_stub, |
441 | | longest_match_stub, |
442 | | longest_match_slow_stub, |
443 | | slide_hash_stub, |
444 | | }; |
445 | | |
446 | | #endif |