/src/zlib-ng/fallback_builtins.h
Line | Count | Source |
1 | | #ifndef FALLBACK_BUILTINS_H |
2 | | #define FALLBACK_BUILTINS_H |
3 | | |
4 | | #if defined(_MSC_VER) && !defined(__clang__) |
5 | | # include <intrin.h> |
6 | | #endif |
7 | | |
8 | | /* Provide fallback for compilers that don't support __has_builtin */ |
9 | | #ifndef __has_builtin |
10 | | # define __has_builtin(x) 0 |
11 | | #endif |
12 | | |
13 | | /* Count trailing zeros (CTZ) functions with portable fallback. |
14 | | * |
15 | | * Predicate: Input must be non-zero. The result is undefined for zero input because |
16 | | * __builtin_ctz, BSF, and TZCNT all have undefined/different behavior for zero. TZCNT |
17 | | * returns operand size for zero, BSF leaves destination undefined, and __builtin_ctz |
18 | | * is explicitly undefined per GCC/Clang docs. */ |
19 | | |
20 | 22.6M | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { |
21 | 22.6M | Assert(value != 0, "Invalid input value: 0"); |
22 | 22.6M | #if __has_builtin(__builtin_ctz) |
23 | 22.6M | return (uint32_t)__builtin_ctz(value); |
24 | | #elif defined(_MSC_VER) && !defined(__clang__) |
25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) |
26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ |
27 | | return (uint32_t)_tzcnt_u32(value); |
28 | | # else |
29 | | unsigned long trailing_zero; |
30 | | _BitScanForward(&trailing_zero, value); |
31 | | return (uint32_t)trailing_zero; |
32 | | # endif |
33 | | #else |
34 | | /* De Bruijn CTZ for 32-bit values */ |
35 | | static const uint8_t debruijn_ctz32[32] = { |
36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, |
37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 |
38 | | }; |
39 | | uint32_t lsb = value & (~value + 1u); |
40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; |
41 | | #endif |
42 | 22.6M | } Line | Count | Source | 20 | 22.9k | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { | 21 | 22.9k | Assert(value != 0, "Invalid input value: 0"); | 22 | 22.9k | #if __has_builtin(__builtin_ctz) | 23 | 22.9k | return (uint32_t)__builtin_ctz(value); | 24 | | #elif defined(_MSC_VER) && !defined(__clang__) | 25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) | 26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ | 27 | | return (uint32_t)_tzcnt_u32(value); | 28 | | # else | 29 | | unsigned long trailing_zero; | 30 | | _BitScanForward(&trailing_zero, value); | 31 | | return (uint32_t)trailing_zero; | 32 | | # endif | 33 | | #else | 34 | | /* De Bruijn CTZ for 32-bit values */ | 35 | | static const uint8_t debruijn_ctz32[32] = { | 36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, | 37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 | 38 | | }; | 39 | | uint32_t lsb = value & (~value + 1u); | 40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; | 41 | | #endif | 42 | 22.9k | } |
Unexecuted instantiation: deflate_fast.c:zng_ctz32 Unexecuted instantiation: deflate_huff.c:zng_ctz32 Unexecuted instantiation: deflate_medium.c:zng_ctz32 Unexecuted instantiation: deflate_quick.c:zng_ctz32 Unexecuted instantiation: deflate_rle.c:zng_ctz32 Unexecuted instantiation: deflate_slow.c:zng_ctz32 Unexecuted instantiation: deflate_stored.c:zng_ctz32 Unexecuted instantiation: functable.c:zng_ctz32 Unexecuted instantiation: trees.c:zng_ctz32 Unexecuted instantiation: compare256_sse2.c:zng_ctz32 Unexecuted instantiation: crc32_chorba_sse2.c:zng_ctz32 Unexecuted instantiation: crc32_chorba_sse41.c:zng_ctz32 compare256_avx2.c:zng_ctz32 Line | Count | Source | 20 | 22.5M | Z_FORCEINLINE static uint32_t zng_ctz32(uint32_t value) { | 21 | 22.5M | Assert(value != 0, "Invalid input value: 0"); | 22 | 22.5M | #if __has_builtin(__builtin_ctz) | 23 | 22.5M | return (uint32_t)__builtin_ctz(value); | 24 | | #elif defined(_MSC_VER) && !defined(__clang__) | 25 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) | 26 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ | 27 | | return (uint32_t)_tzcnt_u32(value); | 28 | | # else | 29 | | unsigned long trailing_zero; | 30 | | _BitScanForward(&trailing_zero, value); | 31 | | return (uint32_t)trailing_zero; | 32 | | # endif | 33 | | #else | 34 | | /* De Bruijn CTZ for 32-bit values */ | 35 | | static const uint8_t debruijn_ctz32[32] = { | 36 | | 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, | 37 | | 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 | 38 | | }; | 39 | | uint32_t lsb = value & (~value + 1u); | 40 | | return debruijn_ctz32[(lsb * 0x077CB531U) >> 27]; | 41 | | #endif | 42 | 22.5M | } |
Unexecuted instantiation: adler32_avx512.c:zng_ctz32 Unexecuted instantiation: compare256_avx512.c:zng_ctz32 Unexecuted instantiation: adler32_avx512_vnni.c:zng_ctz32 Unexecuted instantiation: adler32_c.c:zng_ctz32 Unexecuted instantiation: crc32_braid_c.c:zng_ctz32 Unexecuted instantiation: crc32_chorba_c.c:zng_ctz32 Unexecuted instantiation: inflate.c:zng_ctz32 Unexecuted instantiation: inftrees.c:zng_ctz32 |
43 | | |
44 | 0 | Z_FORCEINLINE static uint32_t zng_ctz64(uint64_t value) { |
45 | 0 | Assert(value != 0, "Invalid input value: 0"); |
46 | 0 | #if __has_builtin(__builtin_ctzll) |
47 | 0 | return (uint32_t)__builtin_ctzll(value); |
48 | | #elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT) |
49 | | # if defined(X86_FEATURES) && !(_MSC_VER < 1700) |
50 | | /* tzcnt falls back to bsf on cpus without BMI1, and is equal or faster on all x86 cpus. */ |
51 | | return (uint32_t)_tzcnt_u64(value); |
52 | | # else |
53 | | unsigned long trailing_zero; |
54 | | _BitScanForward64(&trailing_zero, value); |
55 | | return (uint32_t)trailing_zero; |
56 | | # endif |
57 | | #else |
58 | | /* De Bruijn CTZ for 64-bit values */ |
59 | | static const uint8_t debruijn_ctz64[64] = { |
60 | | 63, 0, 1, 52, 2, 6, 53, 26, 3, 37, 40, 7, 33, 54, 47, 27, |
61 | | 61, 4, 38, 45, 43, 41, 21, 8, 23, 34, 58, 55, 48, 17, 28, 10, |
62 | | 62, 51, 5, 25, 36, 39, 32, 46, 60, 44, 42, 20, 22, 57, 16, 9, |
63 | | 50, 24, 35, 31, 59, 19, 56, 15, 49, 30, 18, 14, 29, 13, 12, 11 |
64 | | }; |
65 | | uint64_t lsb = value & (~value + 1ull); |
66 | | return debruijn_ctz64[(lsb * 0x045FBAC7992A70DAULL) >> 58]; |
67 | | #endif |
68 | 0 | } Unexecuted instantiation: deflate.c:zng_ctz64 Unexecuted instantiation: deflate_fast.c:zng_ctz64 Unexecuted instantiation: deflate_huff.c:zng_ctz64 Unexecuted instantiation: deflate_medium.c:zng_ctz64 Unexecuted instantiation: deflate_quick.c:zng_ctz64 Unexecuted instantiation: deflate_rle.c:zng_ctz64 Unexecuted instantiation: deflate_slow.c:zng_ctz64 Unexecuted instantiation: deflate_stored.c:zng_ctz64 Unexecuted instantiation: functable.c:zng_ctz64 Unexecuted instantiation: trees.c:zng_ctz64 Unexecuted instantiation: compare256_sse2.c:zng_ctz64 Unexecuted instantiation: crc32_chorba_sse2.c:zng_ctz64 Unexecuted instantiation: crc32_chorba_sse41.c:zng_ctz64 Unexecuted instantiation: compare256_avx2.c:zng_ctz64 Unexecuted instantiation: adler32_avx512.c:zng_ctz64 Unexecuted instantiation: compare256_avx512.c:zng_ctz64 Unexecuted instantiation: adler32_avx512_vnni.c:zng_ctz64 Unexecuted instantiation: adler32_c.c:zng_ctz64 Unexecuted instantiation: crc32_braid_c.c:zng_ctz64 Unexecuted instantiation: crc32_chorba_c.c:zng_ctz64 Unexecuted instantiation: inflate.c:zng_ctz64 Unexecuted instantiation: inftrees.c:zng_ctz64 |
69 | | |
70 | | /* Count leading zeros (CLZ) functions with portable fallback. |
71 | | * |
72 | | * Predicate: Input must be non-zero. The result is undefined for zero input because |
73 | | * __builtin_clz, BSR, and LZCNT all have undefined/different behavior for zero. LZCNT |
74 | | * returns operand size for zero, BSR leaves destination undefined, and __builtin_clz |
75 | | * is explicitly undefined per GCC/Clang docs. */ |
76 | | |
77 | 0 | Z_FORCEINLINE static uint32_t zng_clz32(uint32_t value) { |
78 | 0 | Assert(value != 0, "Invalid input value: 0"); |
79 | 0 | #if __has_builtin(__builtin_clz) |
80 | 0 | return (uint32_t)__builtin_clz(value); |
81 | 0 | #elif defined(_MSC_VER) && !defined(__clang__) |
82 | 0 | unsigned long leading_zero; |
83 | 0 | _BitScanReverse(&leading_zero, value); |
84 | 0 | return 31u - (uint32_t)leading_zero; |
85 | 0 | #else |
86 | 0 | /* Smear the highest set bit down, isolate it, then reuse de Bruijn CTZ */ |
87 | 0 | value |= value >> 1; |
88 | 0 | value |= value >> 2; |
89 | 0 | value |= value >> 4; |
90 | 0 | value |= value >> 8; |
91 | 0 | value |= value >> 16; |
92 | 0 | return 31u - zng_ctz32((value >> 1) + 1u); |
93 | 0 | #endif |
94 | 0 | } Unexecuted instantiation: deflate.c:zng_clz32 Unexecuted instantiation: deflate_fast.c:zng_clz32 Unexecuted instantiation: deflate_huff.c:zng_clz32 Unexecuted instantiation: deflate_medium.c:zng_clz32 Unexecuted instantiation: deflate_quick.c:zng_clz32 Unexecuted instantiation: deflate_rle.c:zng_clz32 Unexecuted instantiation: deflate_slow.c:zng_clz32 Unexecuted instantiation: deflate_stored.c:zng_clz32 Unexecuted instantiation: functable.c:zng_clz32 Unexecuted instantiation: trees.c:zng_clz32 Unexecuted instantiation: compare256_sse2.c:zng_clz32 Unexecuted instantiation: crc32_chorba_sse2.c:zng_clz32 Unexecuted instantiation: crc32_chorba_sse41.c:zng_clz32 Unexecuted instantiation: compare256_avx2.c:zng_clz32 Unexecuted instantiation: adler32_avx512.c:zng_clz32 Unexecuted instantiation: compare256_avx512.c:zng_clz32 Unexecuted instantiation: adler32_avx512_vnni.c:zng_clz32 Unexecuted instantiation: adler32_c.c:zng_clz32 Unexecuted instantiation: crc32_braid_c.c:zng_clz32 Unexecuted instantiation: crc32_chorba_c.c:zng_clz32 Unexecuted instantiation: inflate.c:zng_clz32 Unexecuted instantiation: inftrees.c:zng_clz32 |
95 | | |
96 | 0 | Z_FORCEINLINE static uint32_t zng_clz64(uint64_t value) { |
97 | 0 | Assert(value != 0, "Invalid input value: 0"); |
98 | 0 | #if __has_builtin(__builtin_clzll) |
99 | 0 | return (uint32_t)__builtin_clzll(value); |
100 | 0 | #elif defined(_MSC_VER) && !defined(__clang__) && defined(ARCH_64BIT) |
101 | 0 | unsigned long leading_zero; |
102 | 0 | _BitScanReverse64(&leading_zero, value); |
103 | 0 | return 63u - (uint32_t)leading_zero; |
104 | 0 | #elif defined(_MSC_VER) && !defined(__clang__) |
105 | 0 | /* 32-bit MSVC fallback using two 32-bit scans */ |
106 | 0 | unsigned long leading_zero; |
107 | 0 | if (_BitScanReverse(&leading_zero, (uint32_t)(value >> 32))) |
108 | 0 | return 31u - (uint32_t)leading_zero; |
109 | 0 | _BitScanReverse(&leading_zero, (uint32_t)value); |
110 | 0 | return 63u - (uint32_t)leading_zero; |
111 | 0 | #else |
112 | 0 | /* Smear the highest set bit down, isolate it, then reuse de Bruijn CTZ */ |
113 | 0 | value |= value >> 1; |
114 | 0 | value |= value >> 2; |
115 | 0 | value |= value >> 4; |
116 | 0 | value |= value >> 8; |
117 | 0 | value |= value >> 16; |
118 | 0 | value |= value >> 32; |
119 | 0 | return 63u - zng_ctz64((value >> 1) + 1ull); |
120 | 0 | #endif |
121 | 0 | } Unexecuted instantiation: deflate.c:zng_clz64 Unexecuted instantiation: deflate_fast.c:zng_clz64 Unexecuted instantiation: deflate_huff.c:zng_clz64 Unexecuted instantiation: deflate_medium.c:zng_clz64 Unexecuted instantiation: deflate_quick.c:zng_clz64 Unexecuted instantiation: deflate_rle.c:zng_clz64 Unexecuted instantiation: deflate_slow.c:zng_clz64 Unexecuted instantiation: deflate_stored.c:zng_clz64 Unexecuted instantiation: functable.c:zng_clz64 Unexecuted instantiation: trees.c:zng_clz64 Unexecuted instantiation: compare256_sse2.c:zng_clz64 Unexecuted instantiation: crc32_chorba_sse2.c:zng_clz64 Unexecuted instantiation: crc32_chorba_sse41.c:zng_clz64 Unexecuted instantiation: compare256_avx2.c:zng_clz64 Unexecuted instantiation: adler32_avx512.c:zng_clz64 Unexecuted instantiation: compare256_avx512.c:zng_clz64 Unexecuted instantiation: adler32_avx512_vnni.c:zng_clz64 Unexecuted instantiation: adler32_c.c:zng_clz64 Unexecuted instantiation: crc32_braid_c.c:zng_clz64 Unexecuted instantiation: crc32_chorba_c.c:zng_clz64 Unexecuted instantiation: inflate.c:zng_clz64 Unexecuted instantiation: inftrees.c:zng_clz64 |
122 | | |
123 | | /* Byte-position of the first differing byte in a native-endian XOR diff, |
124 | | * using CTZ on little-endian and CLZ on big-endian to avoid a byte-swap. */ |
125 | | #if BYTE_ORDER == BIG_ENDIAN |
126 | | # define zng_first_diff_byte32(diff) (zng_clz32(diff) / 8) |
127 | | # define zng_first_diff_byte64(diff) (zng_clz64(diff) / 8) |
128 | | #else |
129 | | # define zng_first_diff_byte32(diff) (zng_ctz32(diff) / 8) |
130 | 0 | # define zng_first_diff_byte64(diff) (zng_ctz64(diff) / 8) |
131 | | #endif |
132 | | |
133 | 11.5M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { |
134 | 11.5M | #if __has_builtin(__builtin_bitreverse16) |
135 | 11.5M | return (uint16_t)__builtin_bitreverse16(value); |
136 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) |
137 | | /* ARM bit reversal for 16-bit values using rbit instruction */ |
138 | | uint32_t res; |
139 | | # if __has_builtin(__builtin_rbit) |
140 | | res = __builtin_rbit((uint32_t)value); |
141 | | # else |
142 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); |
143 | | # endif |
144 | | return (uint16_t)(res >> 16); |
145 | | #elif defined(ARCH_LOONGARCH) |
146 | | /* LoongArch bit reversal for 16-bit values */ |
147 | | uint32_t res; |
148 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); |
149 | | return (uint16_t)(res >> 16); |
150 | | #else |
151 | | /* Bit reversal for 8-bit values using multiplication method */ |
152 | | # define bitrev8(value) \ |
153 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) |
154 | | /* General purpose bit reversal for 16-bit values */ |
155 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); |
156 | | # undef bitrev8 |
157 | | #endif |
158 | 11.5M | } Unexecuted instantiation: deflate.c:zng_bitreverse16 Unexecuted instantiation: deflate_fast.c:zng_bitreverse16 Unexecuted instantiation: deflate_huff.c:zng_bitreverse16 Unexecuted instantiation: deflate_medium.c:zng_bitreverse16 Unexecuted instantiation: deflate_quick.c:zng_bitreverse16 Unexecuted instantiation: deflate_rle.c:zng_bitreverse16 Unexecuted instantiation: deflate_slow.c:zng_bitreverse16 Unexecuted instantiation: deflate_stored.c:zng_bitreverse16 Unexecuted instantiation: functable.c:zng_bitreverse16 Line | Count | Source | 133 | 6.70M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { | 134 | 6.70M | #if __has_builtin(__builtin_bitreverse16) | 135 | 6.70M | return (uint16_t)__builtin_bitreverse16(value); | 136 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) | 137 | | /* ARM bit reversal for 16-bit values using rbit instruction */ | 138 | | uint32_t res; | 139 | | # if __has_builtin(__builtin_rbit) | 140 | | res = __builtin_rbit((uint32_t)value); | 141 | | # else | 142 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); | 143 | | # endif | 144 | | return (uint16_t)(res >> 16); | 145 | | #elif defined(ARCH_LOONGARCH) | 146 | | /* LoongArch bit reversal for 16-bit values */ | 147 | | uint32_t res; | 148 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); | 149 | | return (uint16_t)(res >> 16); | 150 | | #else | 151 | | /* Bit reversal for 8-bit values using multiplication method */ | 152 | | # define bitrev8(value) \ | 153 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) | 154 | | /* General purpose bit reversal for 16-bit values */ | 155 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); | 156 | | # undef bitrev8 | 157 | | #endif | 158 | 6.70M | } |
Unexecuted instantiation: compare256_sse2.c:zng_bitreverse16 Unexecuted instantiation: crc32_chorba_sse2.c:zng_bitreverse16 Unexecuted instantiation: crc32_chorba_sse41.c:zng_bitreverse16 Unexecuted instantiation: compare256_avx2.c:zng_bitreverse16 Unexecuted instantiation: adler32_avx512.c:zng_bitreverse16 Unexecuted instantiation: compare256_avx512.c:zng_bitreverse16 Unexecuted instantiation: adler32_avx512_vnni.c:zng_bitreverse16 Unexecuted instantiation: adler32_c.c:zng_bitreverse16 Unexecuted instantiation: crc32_braid_c.c:zng_bitreverse16 Unexecuted instantiation: crc32_chorba_c.c:zng_bitreverse16 Unexecuted instantiation: inflate.c:zng_bitreverse16 inftrees.c:zng_bitreverse16 Line | Count | Source | 133 | 4.83M | Z_FORCEINLINE static uint16_t zng_bitreverse16(uint16_t value) { | 134 | 4.83M | #if __has_builtin(__builtin_bitreverse16) | 135 | 4.83M | return (uint16_t)__builtin_bitreverse16(value); | 136 | | #elif defined(ARCH_ARM) && defined(ARCH_64BIT) && !defined(_MSC_VER) | 137 | | /* ARM bit reversal for 16-bit values using rbit instruction */ | 138 | | uint32_t res; | 139 | | # if __has_builtin(__builtin_rbit) | 140 | | res = __builtin_rbit((uint32_t)value); | 141 | | # else | 142 | | __asm__ volatile("rbit %w0, %w1" : "=r"(res) : "r"((uint32_t)value)); | 143 | | # endif | 144 | | return (uint16_t)(res >> 16); | 145 | | #elif defined(ARCH_LOONGARCH) | 146 | | /* LoongArch bit reversal for 16-bit values */ | 147 | | uint32_t res; | 148 | | __asm__ volatile("bitrev.w %0, %1" : "=r"(res) : "r"(value)); | 149 | | return (uint16_t)(res >> 16); | 150 | | #else | 151 | | /* Bit reversal for 8-bit values using multiplication method */ | 152 | | # define bitrev8(value) \ | 153 | | (uint8_t)((((uint8_t)(value) * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32) | 154 | | /* General purpose bit reversal for 16-bit values */ | 155 | | return ((bitrev8(value >> 8) | (uint16_t)bitrev8(value) << 8)); | 156 | | # undef bitrev8 | 157 | | #endif | 158 | 4.83M | } |
|
159 | | |
160 | | #endif // include guard FALLBACK_BUILTINS_H |