/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#ifndef DISABLE_RUNTIME_CPU_DETECTION

#include "zbuild.h"

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

#include "functable.h"
#include "cpu_features.h"
#include "arch_functions.h"

/* Platform has pointer size atomic store */
#if defined(__GNUC__) || defined(__clang__)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    __atomic_store(&(functable.FUNC_NAME), &(VAR.FUNC_NAME), __ATOMIC_SEQ_CST)
#  define FUNCTABLE_BARRIER() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#elif defined(_MSC_VER)
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    _InterlockedExchangePointer((void * volatile *)&(functable.FUNC_NAME), (void *)(VAR.FUNC_NAME))
#  if defined(_M_ARM) || defined(_M_ARM64)
#    define FUNCTABLE_BARRIER() do { \
    _ReadWriteBarrier(); \
    __dmb(0xB); /* _ARM_BARRIER_ISH */ \
    _ReadWriteBarrier(); \
} while (0)
#  else
#    define FUNCTABLE_BARRIER() _ReadWriteBarrier()
#  endif
#else
#  warning Unable to detect atomic intrinsic support.
#  define FUNCTABLE_ASSIGN(VAR, FUNC_NAME) \
    *((void * volatile *)&(functable.FUNC_NAME)) = (void *)(VAR.FUNC_NAME)
#  define FUNCTABLE_BARRIER() do { /* Empty */ } while (0)
#endif
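
/* Each table slot is written with a pointer-sized atomic store, so a concurrent
 * caller never observes a torn pointer, and FUNCTABLE_BARRIER() orders those
 * stores on weakly ordered CPUs so other cores see a fully populated table.
 * If several threads race into init_functable(), each simply repeats the same
 * idempotent assignments, so the race is benign. */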

/* No-op that replaces force_init_stub once the table has been populated, so
 * later calls through functable.force_init() do nothing. */
static void force_init_empty(void) {
    // empty
}

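/* Detect CPU features and select the best available implementation for each
 * entry point, then publish the selections into the global functable. */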
static void init_functable(void) {
    struct functable_s ft;
    struct cpu_features cf;

    cpu_check_features(&cf);

    // Generic code
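    // (portable C fallbacks are installed first, so every slot holds a valid
    //  pointer even if none of the arch-specific branches below are taken)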
    ft.force_init = &force_init_empty;
    ft.adler32 = &adler32_c;
    ft.adler32_fold_copy = &adler32_fold_copy_c;
    ft.chunkmemset_safe = &chunkmemset_safe_c;
    ft.crc32 = &crc32_c;
    ft.crc32_fold = &crc32_fold_c;
    ft.crc32_fold_copy = &crc32_fold_copy_c;
    ft.crc32_fold_final = &crc32_fold_final_c;
    ft.crc32_fold_reset = &crc32_fold_reset_c;
    ft.inflate_fast = &inflate_fast_c;
    ft.slide_hash = &slide_hash_c;
    ft.longest_match = &longest_match_c;
    ft.longest_match_slow = &longest_match_slow_c;
    ft.compare256 = &compare256_c;

    // Select arch-optimized functions
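    // (later, more specialized checks deliberately overwrite earlier picks, so
    //  the most capable extension that is both compiled in and detected at
    //  runtime ends up selected)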

    // X86 - SSE2
#ifdef X86_SSE2
#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (cf.x86.has_sse2)
#  endif
    {
        ft.chunkmemset_safe = &chunkmemset_safe_sse2;
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse2;
#endif
        ft.inflate_fast = &inflate_fast_sse2;
        ft.slide_hash = &slide_hash_sse2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_sse2;
        ft.longest_match = &longest_match_sse2;
        ft.longest_match_slow = &longest_match_slow_sse2;
#  endif
    }
#endif
    // X86 - SSSE3
#ifdef X86_SSSE3
    if (cf.x86.has_ssse3) {
        ft.adler32 = &adler32_ssse3;
        ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
        ft.inflate_fast = &inflate_fast_ssse3;
    }
#endif

    // X86 - SSE4.1
#ifdef X86_SSE41
    if (cf.x86.has_sse41) {
#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE)
        ft.crc32 = &crc32_chorba_sse41;
#endif
    }
#endif

    // X86 - SSE4.2
#ifdef X86_SSE42
    if (cf.x86.has_sse42) {
        ft.adler32_fold_copy = &adler32_fold_copy_sse42;
    }
#endif
    // X86 - PCLMUL
#ifdef X86_PCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq) {
        ft.crc32 = &crc32_pclmulqdq;
        ft.crc32_fold = &crc32_fold_pclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_pclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_pclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_pclmulqdq_reset;
    }
#endif
    // X86 - AVX2
#ifdef X86_AVX2
    /* BMI2 support is all but implicit with AVX2, but sanity check it just in case. BMI2 provides
     * flagless shifts, resulting in fewer flag stalls in the pipeline, and lets us name a separate
     * destination register for the shift result, eliminating several register-register moves when
     * the original value needs to remain intact. It also allows a shift count operand other than
     * the CL register, avoiding contention there. */
    if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
        ft.adler32 = &adler32_avx2;
        ft.adler32_fold_copy = &adler32_fold_copy_avx2;
        ft.chunkmemset_safe = &chunkmemset_safe_avx2;
        ft.inflate_fast = &inflate_fast_avx2;
        ft.slide_hash = &slide_hash_avx2;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_avx2;
        ft.longest_match = &longest_match_avx2;
        ft.longest_match_slow = &longest_match_slow_avx2;
#  endif
    }
#endif
    // X86 - AVX512 (F,DQ,BW,VL)
#ifdef X86_AVX512
    if (cf.x86.has_avx512_common) {
        ft.adler32 = &adler32_avx512;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512;
        ft.chunkmemset_safe = &chunkmemset_safe_avx512;
        ft.inflate_fast = &inflate_fast_avx512;
#  ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_avx512;
        ft.longest_match = &longest_match_avx512;
        ft.longest_match_slow = &longest_match_slow_avx512;
#  endif
    }
#endif
#ifdef X86_AVX512VNNI
    if (cf.x86.has_avx512vnni) {
        ft.adler32 = &adler32_avx512_vnni;
        ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
    }
#endif
    // X86 - VPCLMULQDQ
#ifdef X86_VPCLMULQDQ_CRC
    if (cf.x86.has_pclmulqdq && cf.x86.has_avx512_common && cf.x86.has_vpclmulqdq) {
        ft.crc32 = &crc32_vpclmulqdq;
        ft.crc32_fold = &crc32_fold_vpclmulqdq;
        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
    }
#endif


    // ARM - SIMD
#ifdef ARM_SIMD
#  ifndef ARM_NOCHECK_SIMD
    if (cf.arm.has_simd)
#  endif
    {
        ft.slide_hash = &slide_hash_armv6;
    }
#endif
    // ARM - NEON
#ifdef ARM_NEON
#  ifndef ARM_NOCHECK_NEON
    if (cf.arm.has_neon)
#  endif
    {
        ft.adler32 = &adler32_neon;
        ft.adler32_fold_copy = &adler32_fold_copy_neon;
        ft.chunkmemset_safe = &chunkmemset_safe_neon;
        ft.inflate_fast = &inflate_fast_neon;
        ft.slide_hash = &slide_hash_neon;
#  ifdef HAVE_BUILTIN_CTZLL
        ft.compare256 = &compare256_neon;
        ft.longest_match = &longest_match_neon;
        ft.longest_match_slow = &longest_match_slow_neon;
#  endif
    }
#endif
    // ARM - CRC32
#ifdef ARM_CRC32
    if (cf.arm.has_crc32) {
        ft.crc32 = &crc32_armv8;
        ft.crc32_fold = &crc32_fold_armv8;
        ft.crc32_fold_copy = &crc32_fold_copy_armv8;
    }
#endif


    // Power - VMX
#ifdef PPC_VMX
    if (cf.power.has_altivec) {
        ft.adler32 = &adler32_vmx;
        ft.slide_hash = &slide_hash_vmx;
    }
#endif
    // Power8 - VSX
#ifdef POWER8_VSX
    if (cf.power.has_arch_2_07) {
        ft.adler32 = &adler32_power8;
        ft.chunkmemset_safe = &chunkmemset_safe_power8;
        ft.inflate_fast = &inflate_fast_power8;
        ft.slide_hash = &slide_hash_power8;
    }
#endif
#ifdef POWER8_VSX_CRC32
    if (cf.power.has_arch_2_07)
        ft.crc32 = &crc32_power8;
#endif
    // Power9
#ifdef POWER9
    if (cf.power.has_arch_3_00) {
        ft.compare256 = &compare256_power9;
        ft.longest_match = &longest_match_power9;
        ft.longest_match_slow = &longest_match_slow_power9;
    }
#endif


    // RISCV - RVV
#ifdef RISCV_RVV
    if (cf.riscv.has_rvv) {
        ft.adler32 = &adler32_rvv;
        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
        ft.chunkmemset_safe = &chunkmemset_safe_rvv;
        ft.compare256 = &compare256_rvv;
        ft.inflate_fast = &inflate_fast_rvv;
        ft.longest_match = &longest_match_rvv;
        ft.longest_match_slow = &longest_match_slow_rvv;
        ft.slide_hash = &slide_hash_rvv;
    }
#endif

    // RISCV - ZBC
#ifdef RISCV_CRC32_ZBC
    if (cf.riscv.has_zbc) {
        ft.crc32 = &crc32_riscv64_zbc;
    }
#endif

    // S390
#ifdef S390_CRC32_VX
    if (cf.s390.has_vx)
        ft.crc32 = &crc32_s390_vx;
#endif

    // LOONGARCH
#ifdef LOONGARCH_CRC
    if (cf.loongarch.has_crc) {
        ft.crc32 = &crc32_loongarch64;
        ft.crc32_fold = &crc32_fold_loongarch64;
        ft.crc32_fold_copy = &crc32_fold_copy_loongarch64;
    }
#endif
#ifdef LOONGARCH_LSX
    if (cf.loongarch.has_lsx) {
        ft.adler32 = &adler32_lsx;
        ft.adler32_fold_copy = &adler32_fold_copy_lsx;
        ft.slide_hash = &slide_hash_lsx;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lsx;
        ft.longest_match = &longest_match_lsx;
        ft.longest_match_slow = &longest_match_slow_lsx;
#  endif
        ft.chunkmemset_safe = &chunkmemset_safe_lsx;
        ft.inflate_fast = &inflate_fast_lsx;
    }
#endif
#ifdef LOONGARCH_LASX
    if (cf.loongarch.has_lasx) {
        ft.adler32 = &adler32_lasx;
        ft.adler32_fold_copy = &adler32_fold_copy_lasx;
        ft.slide_hash = &slide_hash_lasx;
#  ifdef HAVE_BUILTIN_CTZ
        ft.compare256 = &compare256_lasx;
        ft.longest_match = &longest_match_lasx;
        ft.longest_match_slow = &longest_match_slow_lasx;
#  endif
        ft.chunkmemset_safe = &chunkmemset_safe_lasx;
        ft.inflate_fast = &inflate_fast_lasx;
    }
#endif

    // Assign function pointers individually for atomic operation
    FUNCTABLE_ASSIGN(ft, force_init);
    FUNCTABLE_ASSIGN(ft, adler32);
    FUNCTABLE_ASSIGN(ft, adler32_fold_copy);
    FUNCTABLE_ASSIGN(ft, chunkmemset_safe);
    FUNCTABLE_ASSIGN(ft, compare256);
    FUNCTABLE_ASSIGN(ft, crc32);
    FUNCTABLE_ASSIGN(ft, crc32_fold);
    FUNCTABLE_ASSIGN(ft, crc32_fold_copy);
    FUNCTABLE_ASSIGN(ft, crc32_fold_final);
    FUNCTABLE_ASSIGN(ft, crc32_fold_reset);
    FUNCTABLE_ASSIGN(ft, inflate_fast);
    FUNCTABLE_ASSIGN(ft, longest_match);
    FUNCTABLE_ASSIGN(ft, longest_match_slow);
    FUNCTABLE_ASSIGN(ft, slide_hash);

    // Memory barrier for weak memory order CPUs
    FUNCTABLE_BARRIER();
}

/* stub functions */
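/* Each stub initializes the function table on first use and then forwards the
 * call through the freshly populated table, so initialization is lazy and
 * happens on the first call to any entry point. After that, lookups normally
 * go straight to the selected implementations and bypass these stubs. */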
static void force_init_stub(void) {
    init_functable();
}

static uint32_t adler32_stub(uint32_t adler, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.adler32(adler, buf, len);
}

static uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    return functable.adler32_fold_copy(adler, dst, src, len);
}

static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
    init_functable();
    return functable.chunkmemset_safe(out, from, len, left);
}

static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) {
    init_functable();
    return functable.compare256(src0, src1);
}

static uint32_t crc32_stub(uint32_t crc, const uint8_t* buf, size_t len) {
    init_functable();
    return functable.crc32(crc, buf, len);
}

static void crc32_fold_stub(crc32_fold* crc, const uint8_t* src, size_t len, uint32_t init_crc) {
    init_functable();
    functable.crc32_fold(crc, src, len, init_crc);
}

static void crc32_fold_copy_stub(crc32_fold* crc, uint8_t* dst, const uint8_t* src, size_t len) {
    init_functable();
    functable.crc32_fold_copy(crc, dst, src, len);
}

static uint32_t crc32_fold_final_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_final(crc);
}

static uint32_t crc32_fold_reset_stub(crc32_fold* crc) {
    init_functable();
    return functable.crc32_fold_reset(crc);
}

static void inflate_fast_stub(PREFIX3(stream) *strm, uint32_t start) {
    init_functable();
    functable.inflate_fast(strm, start);
}

static uint32_t longest_match_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match(s, cur_match);
}

static uint32_t longest_match_slow_stub(deflate_state* const s, Pos cur_match) {
    init_functable();
    return functable.longest_match_slow(s, cur_match);
}

static void slide_hash_stub(deflate_state* s) {
    init_functable();
    functable.slide_hash(s);
}

/* functable init */
Z_INTERNAL struct functable_s functable = {
    force_init_stub,
    adler32_stub,
    adler32_fold_copy_stub,
    chunkmemset_safe_stub,
    compare256_stub,
    crc32_stub,
    crc32_fold_stub,
    crc32_fold_copy_stub,
    crc32_fold_final_stub,
    crc32_fold_reset_stub,
    inflate_fast_stub,
    longest_match_stub,
    longest_match_slow_stub,
    slide_hash_stub,
};
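
/* Illustrative usage (a sketch, not part of this file): callers elsewhere in
 * the library dispatch through the table rather than naming an implementation
 * directly, for example:
 *
 *     crc = functable.crc32(crc, buf, len);
 *
 * The first such call lands in a stub above and triggers init_functable(). */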

#endif /* DISABLE_RUNTIME_CPU_DETECTION */