/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef DISABLE_RUNTIME_CPU_DETECTION

#include "zbuild.h"

#if defined(_MSC_VER)
# include <intrin.h>
#endif

#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
# define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
# if defined(_M_ARM) || defined(_M_ARM64)
#  define FUNCTABLE_BARRIER() do { \
        _ReadWriteBarrier(); \
        __dmb(0xB); /* _ARM_BARRIER_ISH */ \
        _ReadWriteBarrier(); \
    } while (0)
# else
#  define FUNCTABLE_BARRIER() _ReadWriteBarrier()
# endif
#else
# warning Unable to detect atomic intrinsic support.
# define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
# define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
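/* Illustration (GCC/Clang path): FUNCTABLE_ASSIGN(ft, crc32) expands to
 *   __atomic_store(&(functable.crc32), &(ft.crc32), __ATOMIC_SEQ_CST);
 * i.e. a sequentially consistent pointer-sized store, so concurrent readers observe either
 * the stub or the selected implementation, never a torn pointer. */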

static void force_init_empty(void) {
    // Empty on purpose: init_functable() installs this in place of force_init_stub,
    // so later calls to functable.force_init() are no-ops.
}

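/* init_functable() runs the CPU feature detection once, fills a local functable_s with the
 * best implementation available for each entry (starting from the generic C versions and
 * overriding them with progressively more specialized variants), and finally publishes each
 * pointer into the global table with an atomic store. */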
static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.chunksize = &chunksize_c;
    ft.crc32 = &crc32_c;
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.slide_hash = &slide_hash_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;

    // Select arch-optimized functions
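    // Ordering matters: later, more specific blocks override earlier ones, so e.g. the AVX2
    // entries set below are replaced again by the AVX512 variants when those are available.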

    // X86 - SSE2
#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
# endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
        ft.chunksize = &chunksize_sse2;
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse2;
#endif
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
# endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
    }
#endif

    // X86 - SSE4.1
#ifdef X86_SSE41
    if (cf.x86.has_sse41) {
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse41;
#endif
    }
#endif

    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX2
#ifdef X86_AVX2
    /* BMI2 support is all but implicit with AVX2, but let's sanity-check it just in case. BMI2 allows for
     * flagless shifts, resulting in fewer flag stalls in the pipeline, and lets the destination register for
     * the shift result be specified as an operand, eliminating several register-register moves when the
     * original value needs to remain intact. BMI2 shifts also accept a count operand that isn't the CL
     * register, avoiding contention there. */
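    /* For example, with BMI2 a shift such as (x << n) can compile to SHLX dst, x, n:
     * the flags are untouched, x is preserved, and the count n may live in any general
     * register rather than being forced into CL. */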
    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.chunksize = &chunksize_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
# endif
    }
#endif
    // X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
    if (cf.x86.has_avx512_common) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.chunksize = &chunksize_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
# ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
# endif
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#ifdef X86_VPCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif


    // ARM - SIMD
#ifdef ARM_SIMD
# ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
# endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
# ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
# endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_fold_copy = &adler32_fold_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.chunksize = &chunksize_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
# ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
# endif
    }
#endif
    // ARM - CRC32
#ifdef ARM_CRC32
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_armv8;
    }
#endif


    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.chunksize = &chunksize_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif


    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.chunksize = &chunksize_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
    if (cf.riscv.has_zbc) {
        ft.crc32 = &crc32_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = &crc32_s390_vx;
#endif

    // LOONGARCH
#ifdef LOONGARCH_CRC
    if (cf.loongarch.has_crc) {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_fold = &crc32_fold_loongarch64;
        ft.crc32_fold_copy = &crc32_fold_copy_loongarch64;
    }
#endif
#ifdef LOONGARCH_LSX
    if (cf.loongarch.has_lsx) {
        ft.adler32 = &adler32_lsx;
        ft.adler32_fold_copy = &adler32_fold_copy_lsx;
        ft.slide_hash = &slide_hash_lsx;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
# endif
        ft.chunksize = &chunksize_lsx;
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
    }
#endif
#ifdef LOONGARCH_LASX
    if (cf.loongarch.has_lasx) {
        ft.adler32 = &adler32_lasx;
        ft.adler32_fold_copy = &adler32_fold_copy_lasx;
        ft.slide_hash = &slide_hash_lasx;
# ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
# endif
        ft.chunksize = &chunksize_lasx;
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
    }
#endif

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, chunksize);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, slide_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
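/* Each stub below performs the one-time initialization and then forwards the call through the
 * freshly populated table. The global functable starts out pointing at these stubs, so the first
 * use of any entry triggers CPU detection; after that, calls go directly to the selected
 * implementation and the stubs are never reached again. Concurrent first calls may each run
 * init_functable(), but every FUNCTABLE_ASSIGN publishes the same pointer values, so the
 * duplicate run only repeats the same stores. */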
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, from, len, left);
}

static uint32_t chunksize_stub(void) {
    init_functable();
    return functable.chunksize();
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    chunksize_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    longest_match_stub,
    longest_match_slow_stub,
    slide_hash_stub,
};
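
/* Usage sketch (illustrative, not part of this file): callers always dispatch through the table,
 * e.g.
 *   uint32_t crc = functable.crc32(0, buf, len);
 * The first such call lands in crc32_stub(), which runs init_functable() and forwards to the
 * selected implementation; later calls bypass the stub entirely. Callers that want to pay the
 * detection cost up front can invoke functable.force_init() once during startup. */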

#endif