/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef DISABLE_RUNTIME_CPU_DETECTION

#include "zbuild.h"

/* fprintf() and abort() are used in the failure paths below; include their headers
 * directly rather than relying on them arriving indirectly via zbuild.h. */
#include <stdio.h>
#include <stdlib.h>

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  ifdef ARCH_ARM
#    define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
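
/* For illustration (a sketch, not part of the build): with the GCC/Clang path,
 *     FUNCTABLE_ASSIGN(ft, adler32);
 * expands roughly to
 *     __atomic_store(&(functable.adler32), &(ft.adler32), __ATOMIC_SEQ_CST);
 * i.e. a single pointer-sized store with sequentially consistent ordering, so a
 * concurrent reader of functable.adler32 sees either the old stub or the new
 * implementation, never a torn pointer. FUNCTABLE_BARRIER() likewise expands to
 * __atomic_thread_fence(__ATOMIC_SEQ_CST) on that path.
 */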

/* Verify that all pointers are valid before assigning; return 1 on failure.
 * This allows the inflateInit/deflateInit family of functions to gracefully
 * return Z_VERSION_ERROR if functable initialization fails.
 */
#define FUNCTABLE_VERIFY_ASSIGN(VAR, FUNC_NAME) \
    if (!VAR.FUNC_NAME) { \
        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
        return 1; \
    } \
    FUNCTABLE_ASSIGN(VAR, FUNC_NAME);
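
/* For example (sketch): FUNCTABLE_VERIFY_ASSIGN(ft, adler32); expands to a
 * NULL check on ft.adler32 followed by the atomic FUNCTABLE_ASSIGN() above,
 * so a missing implementation makes init_functable() return 1 instead of
 * publishing a NULL function pointer.
 */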

/* Functable init & abort on failure.
 * Abort is needed because some stub functions are reachable without first
 * calling any inflateInit/deflateInit functions, and have no error propagation.
 */
#define FUNCTABLE_INIT_ABORT \
    if (init_functable()) { \
        fprintf(stderr, "Zlib-ng functable failed initialization!\n"); \
        abort(); \
    };
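
/* Sketch of use: the dispatch stubs below (adler32_stub() and friends) start with
 * FUNCTABLE_INIT_ABORT;, which expands to the if/abort block above. This covers
 * entry points that can be reached before any inflateInit/deflateInit call, for
 * example a bare call to one of the checksum convenience routines, and therefore
 * have no way to report Z_VERSION_ERROR.
 */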

// Empty stub, used when functable has already been initialized
static int force_init_empty(void) {
    return 0;
}
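
/* Sketch of the expected call pattern (callers outside this file assumed): the
 * init paths do something like
 *     if (functable.force_init() != 0)
 *         return Z_VERSION_ERROR;
 * The very first call goes through force_init_stub() below and runs
 * init_functable(); every later call lands on this no-op.
 */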

/* Functable initialization.
 * Selects the best available optimized functions appropriate for the runtime CPU.
 */
static int init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    memset(&ft, 0, sizeof(struct functable_s));
    cpu_check_features(&cf);
    ft.force_init = &force_init_empty;

    // Set up generic C code fallbacks
#ifndef WITH_ALL_FALLBACKS
#  if defined(ARCH_X86) && defined(ARCH_64BIT) && defined(X86_SSE2)
    /* x86_64 always has SSE2, so the SSE2 implementations assigned below act as the
     * fallbacks; only functions without an SSE2 variant need generic C versions here. */
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
#  endif
#else // WITH_ALL_FALLBACKS
    ft.adler32 = &adler32_c;
    ft.adler32_copy = &adler32_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.compare256 = &compare256_c;
    ft.crc32 = &crc32_braid;
    ft.crc32_copy = &crc32_copy_braid;
    ft.inflate_fast = &inflate_fast_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.slide_hash = &slide_hash_c;
#endif

    // Select arch-optimized functions
#ifdef WITH_OPTIM

    // Chorba generic C fallback
#ifndef WITHOUT_CHORBA
    ft.crc32 = &crc32_chorba;
    ft.crc32_copy = &crc32_copy_chorba;
#endif

    // X86 - SSE2
#ifdef X86_SSE2
#  ifdef ARCH_32BIT
    if (cf.x86.has_sse2)
#  endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.compare256 = &compare256_sse2;
#  if !defined(WITHOUT_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse2;
        ft.crc32_copy = &crc32_copy_chorba_sse2;
#  endif
        ft.inflate_fast = &inflate_fast_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
        ft.slide_hash = &slide_hash_sse2;
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
        ft.adler32_copy = &adler32_copy_ssse3;
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
    }
#endif

    // X86 - SSE4.1
#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE)
    if (cf.x86.has_sse41) {
        ft.crc32 = &crc32_chorba_sse41;
        ft.crc32_copy = &crc32_copy_chorba_sse41;
    }
#endif

    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_copy = &adler32_copy_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_copy = &crc32_copy_pclmulqdq;
    }
#endif
    // X86 - AVX
#ifdef X86_AVX2
    /* BMI2 support is all but implied by AVX2, but sanity check it just in case. BMI2 provides
     * flagless shifts, resulting in fewer flag stalls in the pipeline, and lets us name a separate
     * destination register for the shift result, eliminating several register-register moves when
     * the original value needs to remain intact. It also allows a shift count operand other than
     * the CL register, avoiding contention there. */
    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_copy = &adler32_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.compare256 = &compare256_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
        ft.slide_hash = &slide_hash_avx2;
    }
#endif
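
    /* Illustration (assembly sketch, not emitted by this file): with BMI2 the compiler can use
     *     shlx rax, rbx, rcx      ; rax = rbx << rcx, flags untouched, any count register
     * instead of the legacy sequence
     *     mov  rax, rbx
     *     mov  cl,  <count>
     *     shl  rax, cl            ; count must be in CL, flags updated
     * which is the "flagless shift" benefit the comment above refers to.
     */
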
    // X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
    if (cf.x86.has_avx512_common) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_copy = &adler32_copy_avx512;
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.compare256 = &compare256_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_copy = &adler32_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#ifdef X86_VPCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_copy = &crc32_copy_vpclmulqdq;
    }
#endif


    // ARM - SIMD
#ifdef ARM_SIMD
#  ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_copy = &adler32_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.compare256 = &compare256_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
        ft.slide_hash = &slide_hash_neon;
    }
#endif
    // ARM - CRC32
#ifdef ARM_CRC32
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_armv8;
        ft.crc32_copy = &crc32_copy_armv8;
    }
#endif
    // ARM - PMULL EOR3
#ifdef ARM_PMULL_EOR3
    if (cf.arm.has_crc32 && cf.arm.has_pmull && cf.arm.has_eor3 && cf.arm.has_fast_pmull) {
        ft.crc32 = &crc32_armv8_pmull_eor3;
        ft.crc32_copy = &crc32_copy_armv8_pmull_eor3;
    }
#endif

    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.adler32_copy = &adler32_copy_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.adler32_copy = &adler32_copy_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07) {
        ft.crc32 = &crc32_power8;
        ft.crc32_copy = &crc32_copy_power8;
    }
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif


    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_copy = &adler32_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
    if (cf.riscv.has_zbc) {
        ft.crc32 = &crc32_riscv64_zbc;
        ft.crc32_copy = &crc32_copy_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx) {
        ft.crc32 = &crc32_s390_vx;
        ft.crc32_copy = &crc32_copy_s390_vx;
    }
#endif

    // LOONGARCH
#ifdef LOONGARCH_CRC
    if (cf.loongarch.has_crc) {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_copy = &crc32_copy_loongarch64;
    }
#endif
#ifdef LOONGARCH_LSX
    if (cf.loongarch.has_lsx) {
        ft.adler32 = &adler32_lsx;
        ft.adler32_copy = &adler32_copy_lsx;
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.compare256 = &compare256_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
        ft.slide_hash = &slide_hash_lsx;
    }
#endif
#ifdef LOONGARCH_LASX
    if (cf.loongarch.has_lasx) {
        ft.adler32 = &adler32_lasx;
        ft.adler32_copy = &adler32_copy_lasx;
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.compare256 = &compare256_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
        ft.slide_hash = &slide_hash_lasx;
    }
#endif

#endif // WITH_OPTIM

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32);
    FUNCTABLE_VERIFY_ASSIGN(ft, adler32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_VERIFY_ASSIGN(ft, compare256);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32);
    FUNCTABLE_VERIFY_ASSIGN(ft, crc32_copy);
    FUNCTABLE_VERIFY_ASSIGN(ft, inflate_fast);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match);
    FUNCTABLE_VERIFY_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_VERIFY_ASSIGN(ft, slide_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
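
    /* Intent sketch: on weakly ordered CPUs (e.g. ARM, POWER) the fence above is meant to
     * make the pointer stores globally visible before init_functable() returns, so a thread
     * that learns of the completed init also observes the fully populated table. */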

    return Z_OK;
}

/* stub functions */
static int force_init_stub(void) {
    return init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.adler32_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
    FUNCTABLE_INIT_ABORT;
    return functable.chunkmemset_safe(out, from, len, left);
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    FUNCTABLE_INIT_ABORT;
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.crc32(crc, buf, len);
}

static uint32_t crc32_copy_stub(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
    FUNCTABLE_INIT_ABORT;
    return functable.crc32_copy(crc, dst, src, len);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    FUNCTABLE_INIT_ABORT;
    functable.inflate_fast(strm, start);
}

static uint32_t longest_match_stub(deflate_state* const s, uint32_t cur_match) {
    FUNCTABLE_INIT_ABORT;
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, uint32_t cur_match) {
    FUNCTABLE_INIT_ABORT;
    return functable.longest_match_slow(s, cur_match);
}

static void slide_hash_stub(deflate_state* s) {
    FUNCTABLE_INIT_ABORT;
    functable.slide_hash(s);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_copy_stub,
    chunkmemset_safe_stub,
    compare256_stub,
    crc32_stub,
    crc32_copy_stub,
    inflate_fast_stub,
    longest_match_stub,
    longest_match_slow_stub,
    slide_hash_stub,
};
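
/* Dispatch flow (sketch): the table starts out pointing at the stubs above, so the first call
 * through any entry, e.g.
 *     uint32_t a = functable.adler32(1, buf, len);
 * lands in adler32_stub(), which runs init_functable() (or aborts on failure) and then forwards
 * the call to the freshly selected implementation. All later calls go straight to the optimized
 * function with no further checks.
 */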

#endif // DISABLE_RUNTIME_CPU_DETECTION