/src/libdeflate/lib/x86/cpu_features.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * x86/cpu_features.c - feature detection for x86 CPUs |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | */ |
27 | | |
28 | | #include "../cpu_features_common.h" /* must be included first */ |
29 | | #include "cpu_features.h" |
30 | | |
31 | | #ifdef X86_CPU_FEATURES_KNOWN |
32 | | /* Runtime x86 CPU feature detection is supported. */ |
33 | | |
34 | | /* Execute the CPUID instruction. */ |
35 | | static inline void |
36 | | cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) |
37 | 24 | { |
38 | | #ifdef _MSC_VER |
39 | | int result[4]; |
40 | | |
41 | | __cpuidex(result, leaf, subleaf); |
42 | | *a = result[0]; |
43 | | *b = result[1]; |
44 | | *c = result[2]; |
45 | | *d = result[3]; |
46 | | #else |
47 | 24 | __asm__ volatile("cpuid" : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) |
48 | 24 | : "a" (leaf), "c" (subleaf)); |
49 | 24 | #endif |
50 | 24 | } |
51 | | |
52 | | /* Read an extended control register. */ |
53 | | static inline u64 |
54 | | read_xcr(u32 index) |
55 | 6 | { |
56 | | #ifdef _MSC_VER |
57 | | return _xgetbv(index); |
58 | | #else |
59 | 6 | u32 d, a; |
60 | | |
61 | | /* |
62 | | * Execute the "xgetbv" instruction. Old versions of binutils do not |
63 | | * recognize this instruction, so list the raw bytes instead. |
64 | | * |
65 | | * This must be 'volatile' to prevent this code from being moved out |
66 | | * from under the check for OSXSAVE. |
67 | | */ |
68 | 6 | __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : |
69 | 6 | "=d" (d), "=a" (a) : "c" (index)); |
70 | | |
71 | 6 | return ((u64)d << 32) | a; |
72 | 6 | #endif |
73 | 6 | } |
74 | | |
75 | | static const struct cpu_feature x86_cpu_feature_table[] = { |
76 | | {X86_CPU_FEATURE_SSE2, "sse2"}, |
77 | | {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, |
78 | | {X86_CPU_FEATURE_AVX, "avx"}, |
79 | | {X86_CPU_FEATURE_AVX2, "avx2"}, |
80 | | {X86_CPU_FEATURE_BMI2, "bmi2"}, |
81 | | {X86_CPU_FEATURE_ZMM, "zmm"}, |
82 | | {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, |
83 | | {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, |
84 | | {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, |
85 | | {X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"}, |
86 | | {X86_CPU_FEATURE_AVXVNNI, "avx_vnni"}, |
87 | | }; |
88 | | |
/*
 * Bitmask of the x86 CPU features that were detected.  It starts out as 0;
 * libdeflate_init_x86_cpu_features() later stores the detected feature bits
 * here, always OR'ed with X86_CPU_FEATURES_KNOWN.
 */
volatile u32 libdeflate_x86_cpu_features = 0;
90 | | |
91 | | static inline bool |
92 | | os_supports_avx512(u64 xcr0) |
93 | 6 | { |
94 | | #ifdef __APPLE__ |
95 | | /* |
96 | | * The Darwin kernel had a bug where it could corrupt the opmask |
97 | | * registers. See |
98 | | * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259 |
99 | | * Darwin also does not initially set the XCR0 bits for AVX512, but they |
100 | | * are set if the thread tries to use AVX512 anyway. Thus, to safely |
101 | | * and consistently use AVX512 on macOS we'd need to check the kernel |
102 | | * version as well as detect AVX512 support using a macOS-specific |
103 | | * method. We don't bother with this, especially given Apple's |
104 | | * transition to arm64. |
105 | | */ |
106 | | return false; |
107 | | #else |
108 | 6 | return (xcr0 & 0xe6) == 0xe6; |
109 | 6 | #endif |
110 | 6 | } |
111 | | |
112 | | /* |
113 | | * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake |
114 | | * and Sapphire Rapids, due to the overly-eager downclocking which can reduce |
115 | | * the performance of workloads that use ZMM registers only occasionally. |
116 | | */ |
117 | | static inline bool |
118 | | allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) |
119 | 0 | { |
120 | | #ifdef TEST_SUPPORT__DO_NOT_USE |
121 | | return true; |
122 | | #endif |
123 | 0 | if (memcmp(manufacturer, "GenuineIntel", 12) != 0) |
124 | 0 | return true; |
125 | 0 | if (family != 6) |
126 | 0 | return true; |
127 | 0 | switch (model) { |
128 | 0 | case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ |
129 | 0 | case 106: /* Ice Lake (Server) */ |
130 | 0 | case 108: /* Ice Lake (Server) */ |
131 | 0 | case 126: /* Ice Lake (Client) */ |
132 | 0 | case 140: /* Tiger Lake */ |
133 | 0 | case 141: /* Tiger Lake */ |
134 | 0 | return false; |
135 | 0 | } |
136 | 0 | return true; |
137 | 0 | } |
138 | | |
139 | | /* Initialize libdeflate_x86_cpu_features. */ |
140 | | void libdeflate_init_x86_cpu_features(void) |
141 | 6 | { |
142 | 6 | u32 max_leaf; |
143 | 6 | u32 manufacturer[3]; |
144 | 6 | u32 family, model; |
145 | 6 | u32 a, b, c, d; |
146 | 6 | u64 xcr0 = 0; |
147 | 6 | u32 features = 0; |
148 | | |
149 | | /* EAX=0: Highest Function Parameter and Manufacturer ID */ |
150 | 6 | cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], |
151 | 6 | &manufacturer[1]); |
152 | 6 | if (max_leaf < 1) |
153 | 0 | goto out; |
154 | | |
155 | | /* EAX=1: Processor Info and Feature Bits */ |
156 | 6 | cpuid(1, 0, &a, &b, &c, &d); |
157 | 6 | family = (a >> 8) & 0xf; |
158 | 6 | model = (a >> 4) & 0xf; |
159 | 6 | if (family == 6 || family == 0xf) |
160 | 6 | model += (a >> 12) & 0xf0; |
161 | 6 | if (family == 0xf) |
162 | 6 | family += (a >> 20) & 0xff; |
163 | 6 | if (d & (1 << 26)) |
164 | 6 | features |= X86_CPU_FEATURE_SSE2; |
165 | | /* |
166 | | * No known CPUs have pclmulqdq without sse4.1, so in practice code |
167 | | * targeting pclmulqdq can use sse4.1 instructions. But to be safe, |
168 | | * explicitly check for both the pclmulqdq and sse4.1 bits. |
169 | | */ |
170 | 6 | if ((c & (1 << 1)) && (c & (1 << 19))) |
171 | 6 | features |= X86_CPU_FEATURE_PCLMULQDQ; |
172 | 6 | if (c & (1 << 27)) |
173 | 6 | xcr0 = read_xcr(0); |
174 | 6 | if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) |
175 | 6 | features |= X86_CPU_FEATURE_AVX; |
176 | | |
177 | 6 | if (max_leaf < 7) |
178 | 0 | goto out; |
179 | | |
180 | | /* EAX=7, ECX=0: Extended Features */ |
181 | 6 | cpuid(7, 0, &a, &b, &c, &d); |
182 | 6 | if (b & (1 << 8)) |
183 | 6 | features |= X86_CPU_FEATURE_BMI2; |
184 | 6 | if ((xcr0 & 0x6) == 0x6) { |
185 | 6 | if (b & (1 << 5)) |
186 | 6 | features |= X86_CPU_FEATURE_AVX2; |
187 | 6 | if (c & (1 << 10)) |
188 | 0 | features |= X86_CPU_FEATURE_VPCLMULQDQ; |
189 | 6 | } |
190 | 6 | if (os_supports_avx512(xcr0)) { |
191 | 0 | if (allow_512bit_vectors(manufacturer, family, model)) |
192 | 0 | features |= X86_CPU_FEATURE_ZMM; |
193 | 0 | if (b & (1 << 30)) |
194 | 0 | features |= X86_CPU_FEATURE_AVX512BW; |
195 | 0 | if (b & (1U << 31)) |
196 | 0 | features |= X86_CPU_FEATURE_AVX512VL; |
197 | 0 | if (c & (1 << 11)) |
198 | 0 | features |= X86_CPU_FEATURE_AVX512VNNI; |
199 | 0 | } |
200 | | |
201 | | /* EAX=7, ECX=1: Extended Features */ |
202 | 6 | cpuid(7, 1, &a, &b, &c, &d); |
203 | 6 | if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6)) |
204 | 0 | features |= X86_CPU_FEATURE_AVXVNNI; |
205 | | |
206 | 6 | out: |
207 | 6 | disable_cpu_features_for_testing(&features, x86_cpu_feature_table, |
208 | 6 | ARRAY_LEN(x86_cpu_feature_table)); |
209 | | |
210 | 6 | libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; |
211 | 6 | } |
212 | | |
213 | | #endif /* X86_CPU_FEATURES_KNOWN */ |