/src/zlib-ng/fallback_builtins.h
Line | Count | Source |
1 | | #ifndef FALLBACK_BUILTINS_H |
2 | | #define FALLBACK_BUILTINS_H |
3 | | |
4 | | #if defined(_MSC_VER) && !defined(__clang__) |
5 | | # include <intrin.h> |
6 | | #endif |
7 | | |
8 | | /* Provide fallback for compilers that don't support __has_builtin */ |
9 | | #ifndef __has_builtin |
10 | | # define __has_builtin(x) 0 |
11 | | #endif |
12 | | |
13 | | /* Count trailing zeros (CTZ) functions with portable fallback. |
14 | | * |
15 | | * Precondition: input must be non-zero. The result is undefined for zero input because |
16 | | * __builtin_ctz, BSF, and TZCNT all have undefined/different behavior for zero. TZCNT |
17 | | * returns operand size for zero, BSF leaves destination undefined, and __builtin_ctz |
18 | | * is explicitly undefined per GCC/Clang docs. */ |
19 | | |
20 | 2.41M | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { |
21 | 2.41M | Assert(value != 0, "Invalid input value: 0"); |
22 | 2.41M | #if __has_builtin(__builtin_ctz) |
23 | 2.41M | return (uint32_t)__builtin_ctz(value); |
24 | | #elif defined(_MSC_VER) && !defined(__clang__) |
25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) |
26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ |
27 | | return (uint32_t)_tzcnt_u32(value); |
28 | | # else |
29 | | unsigned long trailing_zero; |
30 | | _BitScanForward(&trailing_zero, value); |
31 | | return (uint32_t)trailing_zero; |
32 | | # endif |
33 | | #else |
34 | | /* De Bruijn CTZ for 32-bit values */ |
35 | | static const uint8_t debruijn_ctz32[32] = { |
36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, |
37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 |
38 | | }; |
39 | | uint32_t lsb = value & (~value + 1u); |
40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; |
41 | | #endif |
42 | 2.41M | } Line | Count | Source | 20 | 21.3k | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { | 21 | 21.3k | Assert(value != 0, "Invalid input value: 0"); | 22 | 21.3k | #if __has_builtin(__builtin_ctz) | 23 | 21.3k | return (uint32_t)__builtin_ctz(value); | 24 | | #elif defined(_MSC_VER) && !defined(__clang__) | 25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) | 26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ | 27 | | return (uint32_t)_tzcnt_u32(value); | 28 | | # else | 29 | | unsigned long trailing_zero; | 30 | | _BitScanForward(&trailing_zero, value); | 31 | | return (uint32_t)trailing_zero; | 32 | | # endif | 33 | | #else | 34 | | /* De Bruijn CTZ for 32-bit values */ | 35 | | static const uint8_t debruijn_ctz32[32] = { | 36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, | 37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 | 38 | | }; | 39 | | uint32_t lsb = value & (~value + 1u); | 40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; | 41 | | #endif | 42 | 21.3k | } |
Unexecuted instantiation: deflate_fast.c:zng_ctz32 Unexecuted instantiation: deflate_huff.c:zng_ctz32 Unexecuted instantiation: deflate_medium.c:zng_ctz32 Unexecuted instantiation: deflate_quick.c:zng_ctz32 Unexecuted instantiation: deflate_rle.c:zng_ctz32 Unexecuted instantiation: deflate_slow.c:zng_ctz32 Unexecuted instantiation: deflate_stored.c:zng_ctz32 Unexecuted instantiation: functable.c:zng_ctz32 Unexecuted instantiation: inftrees.c:zng_ctz32 Unexecuted instantiation: trees.c:zng_ctz32 Unexecuted instantiation: compare256_sse2.c:zng_ctz32 Unexecuted instantiation: crc32_chorba_sse2.c:zng_ctz32 Unexecuted instantiation: crc32_chorba_sse41.c:zng_ctz32 compare256_avx2.c:zng_ctz32 Line | Count | Source | 20 | 2.38M | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { | 21 | 2.38M | Assert(value != 0, "Invalid input value: 0"); | 22 | 2.38M | #if __has_builtin(__builtin_ctz) | 23 | 2.38M | return (uint32_t)__builtin_ctz(value); | 24 | | #elif defined(_MSC_VER) && !defined(__clang__) | 25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) | 26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ | 27 | | return (uint32_t)_tzcnt_u32(value); | 28 | | # else | 29 | | unsigned long trailing_zero; | 30 | | _BitScanForward(&trailing_zero, value); | 31 | | return (uint32_t)trailing_zero; | 32 | | # endif | 33 | | #else | 34 | | /* De Bruijn CTZ for 32-bit values */ | 35 | | static const uint8_t debruijn_ctz32[32] = { | 36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, | 37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 | 38 | | }; | 39 | | uint32_t lsb = value & (~value + 1u); | 40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; | 41 | | #endif | 42 | 2.38M | } |
Unexecuted instantiation: adler32_avx512.c:zng_ctz32 Unexecuted instantiation: compare256_avx512.c:zng_ctz32 Unexecuted instantiation: adler32_avx512_vnni.c:zng_ctz32 |
43 | | |
44 | 138k | Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) { |
45 | 138k | Assert(value != 0, "Invalid input value: 0"); |
46 | 138k | #if __has_builtin(__builtin_ctzll) |
47 | 138k | return (uint32_t)__builtin_ctzll(value); |
48 | | #elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT) |
49 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) |
50 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ |
51 | | return (uint32_t)_tzcnt_u64(value); |
52 | | # else |
53 | | unsigned long trailing_zero; |
54 | | _BitScanForward64(&trailing_zero, value); |
55 | | return (uint32_t)trailing_zero; |
56 | | # endif |
57 | | #else |
58 | | /* De Bruijn CTZ for 64-bit values */ |
59 | | static const uint8_t debruijn_ctz64[64] = { |
60 | | 63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27, |
61 | | 61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10, |
62 | | 62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9, |
63 | | 50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11 |
64 | | }; |
65 | | uint64_t lsb = value & (~value + 1ull); |
66 | | return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58]; |
67 | | #endif |
68 | 138k | } Unexecuted instantiation: deflate.c:zng_ctz64 Unexecuted instantiation: deflate_fast.c:zng_ctz64 Unexecuted instantiation: deflate_huff.c:zng_ctz64 Unexecuted instantiation: deflate_medium.c:zng_ctz64 Unexecuted instantiation: deflate_quick.c:zng_ctz64 Line | Count | Source | 44 | 138k | Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) { | 45 | 138k | Assert(value != 0, "Invalid input value: 0"); | 46 | 138k | #if __has_builtin(__builtin_ctzll) | 47 | 138k | return (uint32_t)__builtin_ctzll(value); | 48 | | #elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT) | 49 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) | 50 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ | 51 | | return (uint32_t)_tzcnt_u64(value); | 52 | | # else | 53 | | unsigned long trailing_zero; | 54 | | _BitScanForward64(&trailing_zero, value); | 55 | | return (uint32_t)trailing_zero; | 56 | | # endif | 57 | | #else | 58 | | /* De Bruijn CTZ for 64-bit values */ | 59 | | static const uint8_t debruijn_ctz64[64] = { | 60 | | 63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27, | 61 | | 61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10, | 62 | | 62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9, | 63 | | 50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11 | 64 | | }; | 65 | | uint64_t lsb = value & (~value + 1ull); | 66 | | return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58]; | 67 | | #endif | 68 | 138k | } |
Unexecuted instantiation: deflate_slow.c:zng_ctz64 Unexecuted instantiation: deflate_stored.c:zng_ctz64 Unexecuted instantiation: functable.c:zng_ctz64 Unexecuted instantiation: inftrees.c:zng_ctz64 Unexecuted instantiation: trees.c:zng_ctz64 Unexecuted instantiation: compare256_sse2.c:zng_ctz64 Unexecuted instantiation: crc32_chorba_sse2.c:zng_ctz64 Unexecuted instantiation: crc32_chorba_sse41.c:zng_ctz64 Unexecuted instantiation: compare256_avx2.c:zng_ctz64 Unexecuted instantiation: adler32_avx512.c:zng_ctz64 Unexecuted instantiation: compare256_avx512.c:zng_ctz64 Unexecuted instantiation: adler32_avx512_vnni.c:zng_ctz64 |
69 | | |
70 | 4.42M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { |
71 | 4.42M | #if __has_builtin(__builtin_bitreverse16) |
72 | 4.42M | return (uint16_t)__builtin_bitreverse16(value); |
73 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) |
74 | | /* ARM bit reversal for 16-bit values using rbit instruction */ |
75 | | uint32_t res; |
76 | | # if __has_builtin(__builtin_rbit) |
77 | | res = __builtin_rbit((uint32_t)value); |
78 | | # else |
79 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); |
80 | | # endif |
81 | | return (uint16_t)(res >> 16); |
82 | | #elif defined(ARCH_LOONGARCH) |
83 | | /* LoongArch bit reversal for 16-bit values */ |
84 | | uint32_t res; |
85 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); |
86 | | return (uint16_t)(res >> 16); |
87 | | #else |
88 | | /* Bit reversal for 8-bit values using multiplication method */ |
89 | | # define bitrev8(value) \ |
90 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) |
91 | | /* General purpose bit reversal for 16-bit values */ |
92 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); |
93 | | # undef bitrev8 |
94 | | #endif |
95 | 4.42M | } Unexecuted instantiation: deflate.c:zng_bitreverse16 Unexecuted instantiation: deflate_fast.c:zng_bitreverse16 Unexecuted instantiation: deflate_huff.c:zng_bitreverse16 Unexecuted instantiation: deflate_medium.c:zng_bitreverse16 Unexecuted instantiation: deflate_quick.c:zng_bitreverse16 Unexecuted instantiation: deflate_rle.c:zng_bitreverse16 Unexecuted instantiation: deflate_slow.c:zng_bitreverse16 Unexecuted instantiation: deflate_stored.c:zng_bitreverse16 Unexecuted instantiation: functable.c:zng_bitreverse16 inftrees.c:zng_bitreverse16 Line | Count | Source | 70 | 1.34M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { | 71 | 1.34M | #if __has_builtin(__builtin_bitreverse16) | 72 | 1.34M | return (uint16_t)__builtin_bitreverse16(value); | 73 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) | 74 | | /* ARM bit reversal for 16-bit values using rbit instruction */ | 75 | | uint32_t res; | 76 | | # if __has_builtin(__builtin_rbit) | 77 | | res = __builtin_rbit((uint32_t)value); | 78 | | # else | 79 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); | 80 | | # endif | 81 | | return (uint16_t)(res >> 16); | 82 | | #elif defined(ARCH_LOONGARCH) | 83 | | /* LoongArch bit reversal for 16-bit values */ | 84 | | uint32_t res; | 85 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); | 86 | | return (uint16_t)(res >> 16); | 87 | | #else | 88 | | /* Bit reversal for 8-bit values using multiplication method */ | 89 | | # define bitrev8(value) \ | 90 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) | 91 | | /* General purpose bit reversal for 16-bit values */ | 92 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); | 93 | | # undef bitrev8 | 94 | | #endif | 95 | 1.34M | } |
Line | Count | Source | 70 | 3.08M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { | 71 | 3.08M | #if __has_builtin(__builtin_bitreverse16) | 72 | 3.08M | return (uint16_t)__builtin_bitreverse16(value); | 73 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) | 74 | | /* ARM bit reversal for 16-bit values using rbit instruction */ | 75 | | uint32_t res; | 76 | | # if __has_builtin(__builtin_rbit) | 77 | | res = __builtin_rbit((uint32_t)value); | 78 | | # else | 79 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); | 80 | | # endif | 81 | | return (uint16_t)(res >> 16); | 82 | | #elif defined(ARCH_LOONGARCH) | 83 | | /* LoongArch bit reversal for 16-bit values */ | 84 | | uint32_t res; | 85 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); | 86 | | return (uint16_t)(res >> 16); | 87 | | #else | 88 | | /* Bit reversal for 8-bit values using multiplication method */ | 89 | | # define bitrev8(value) \ | 90 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) | 91 | | /* General purpose bit reversal for 16-bit values */ | 92 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); | 93 | | # undef bitrev8 | 94 | | #endif | 95 | 3.08M | } |
Unexecuted instantiation: compare256_sse2.c:zng_bitreverse16 Unexecuted instantiation: crc32_chorba_sse2.c:zng_bitreverse16 Unexecuted instantiation: crc32_chorba_sse41.c:zng_bitreverse16 Unexecuted instantiation: compare256_avx2.c:zng_bitreverse16 Unexecuted instantiation: adler32_avx512.c:zng_bitreverse16 Unexecuted instantiation: compare256_avx512.c:zng_bitreverse16 Unexecuted instantiation: adler32_avx512_vnni.c:zng_bitreverse16 |
96 | | |
97 | | #endif // include guard FALLBACK_BUILTINS_H |