/src/boringssl/crypto/cpu_intel.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include <openssl/base.h> |
16 | | |
17 | | #if !defined(OPENSSL_NO_ASM) && \ |
18 | | (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) |
19 | | |
20 | | #include <errno.h> |
21 | | #include <inttypes.h> |
22 | | #include <limits.h> |
23 | | #include <stdlib.h> |
24 | | #include <string.h> |
25 | | |
26 | | #if defined(_MSC_VER) |
27 | | #include <immintrin.h> |
28 | | #include <intrin.h> |
29 | | #endif |
30 | | |
31 | | #include "internal.h" |
32 | | |
33 | | |
// OPENSSL_cpuid executes the cpuid instruction with |leaf| in EAX. The inline
// assembly paths zero ECX first; the MSVC path goes through the |__cpuid|
// intrinsic. The resulting EAX, EBX, ECX, and EDX values are stored to
// |*out_eax|, |*out_ebx|, |*out_ecx|, and |*out_edx| respectively.
static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx,
                          uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) {
  // Each platform branch below fills these four locals; they are copied to the
  // caller's outputs in one place at the end.
  uint32_t eax, ebx, ecx, edx;
#if defined(_MSC_VER)
  int regs[4];
  __cpuid(regs, (int)leaf);
  eax = (uint32_t)regs[0];
  ebx = (uint32_t)regs[1];
  ecx = (uint32_t)regs[2];
  edx = (uint32_t)regs[3];
#elif defined(__pic__) && defined(OPENSSL_32_BIT)
  // In 32-bit position-independent code EBX is the PIC register and inline
  // assembly must not clobber it. See
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. EBX is therefore parked
  // in EDI across the cpuid, and the final xchg restores EBX while leaving
  // cpuid's EBX result in EDI, which is the register bound to the output.
  __asm__ volatile(
      "xor %%ecx, %%ecx\n"
      "mov %%ebx, %%edi\n"
      "cpuid\n"
      "xchg %%edi, %%ebx\n"
      : "=a"(eax), "=D"(ebx), "=c"(ecx), "=d"(edx)
      : "a"(leaf));
#else
  __asm__ volatile(
      "xor %%ecx, %%ecx\n"
      "cpuid\n"
      : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
      : "a"(leaf));
#endif
  *out_eax = eax;
  *out_ebx = ebx;
  *out_ecx = ecx;
  *out_edx = edx;
}
64 | | |
// OPENSSL_xgetbv reads the Extended Control Register (XCR) selected by |xcr|
// and returns its 64-bit value. Intel currently defines only XCR0, so callers
// are expected to pass zero.
static uint64_t OPENSSL_xgetbv(uint32_t xcr) {
#if defined(_MSC_VER)
  return (uint64_t)_xgetbv(xcr);
#else
  // xgetbv takes the register index in ECX and returns the value split across
  // EDX (upper half) and EAX (lower half).
  uint32_t lo, hi;
  __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
  const uint64_t upper = ((uint64_t)hi) << 32;
  return upper | lo;
#endif
}
76 | | |
// os_supports_avx512 reports whether the XCR0 value |xcr0| indicates that the
// operating system allows AVX512 to be used. It always returns false on Apple
// platforms.
static bool os_supports_avx512(uint64_t xcr0) {
#if defined(__APPLE__)
  // AVX512 is never reported as usable on Darwin, for two reasons:
  //
  // 1. The kernel had a bug where it could corrupt the opmask registers. See
  //    https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
  //
  // 2. Darwin does not initially set the AVX512 bits in XCR0; they only get set
  //    once the thread tries to use AVX512 anyway.
  //
  // Using AVX512 safely and consistently on macOS would therefore require a
  // kernel version check plus a macOS-specific detection method. That is not
  // worth the effort, especially given Apple's transition to arm64.
  return false;
#else
  // 0xe6 selects XCR0 bits 1, 2, 5, 6 and 7 (pattern 111xx11x). All of them
  // must be set at the same time.
  const uint64_t required_state = 0xe6;
  return (xcr0 & required_state) == required_state;
#endif
}
92 | | |
// handle_cpu_env parses one value from |in| and applies it to the pair of
// CPUID words |out[0]| (low 32 bits) and |out[1]| (high 32 bits). See the
// comment in |OPENSSL_adjust_ia32cap| for the accepted syntax. |is_last| says
// whether this value is the final one in the string; when it is not, the value
// may be terminated by a colon instead of the end of the string. Malformed or
// out-of-range input leaves |out| untouched.
static void handle_cpu_env(uint32_t out[2], const char *in, bool is_last) {
  // An optional one-character operator precedes the number.
  const bool clear_bits = in[0] == '~';
  const bool set_bits = in[0] == '|';
  const char *digits = (clear_bits || set_bits) ? in + 1 : in;

  // A "0x" prefix selects hexadecimal; everything else is read as decimal.
  const int radix = (digits[0] == '0' && digits[1] == 'x') ? 16 : 10;

  // The value is 64 bits wide, so |strtoull| must be able to hold it.
  static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                "unsigned long long must be 64 bits");
  char *tail;
  errno = 0;
  const unsigned long long value = strtoull(digits, &tail, radix);

  // Nothing was consumed: there is no number here.
  if (tail == digits) {
    return;
  }
  // The number must run to the end of the string, or, for a non-final value,
  // up to the ':' separator.
  const bool properly_terminated =
      *tail == '\0' || (!is_last && *tail == ':');
  if (!properly_terminated) {
    return;
  }
  // The number did not fit in 64 bits.
  if (value == ULLONG_MAX && errno == ERANGE) {
    return;
  }

  const uint32_t lo = (uint32_t)value;
  const uint32_t hi = (uint32_t)(value >> 32);
  if (clear_bits) {
    out[0] &= ~lo;
    out[1] &= ~hi;
    return;
  }
  if (set_bits) {
    out[0] |= lo;
    out[1] |= hi;
    return;
  }
  out[0] = lo;
  out[1] = hi;
}
127 | | |
128 | 0 | void OPENSSL_adjust_ia32cap(uint32_t cap[4], const char *env) { |
129 | | // OPENSSL_ia32cap can contain zero, one or two values, separated with a ':'. |
130 | | // Each value is a 64-bit, unsigned value which may start with "0x" to |
131 | | // indicate a hex value. Prior to the 64-bit value, a '~' or '|' may be given. |
132 | | // |
133 | | // If the '~' prefix is present: |
134 | | // the value is inverted and ANDed with the probed CPUID result |
135 | | // If the '|' prefix is present: |
136 | | // the value is ORed with the probed CPUID result |
137 | | // Otherwise: |
138 | | // the value is taken as the result of the CPUID |
139 | | // |
140 | | // The first value determines OPENSSL_ia32cap_P[0] and [1]. The second [2] |
141 | | // and [3]. |
142 | 0 | handle_cpu_env(cap, env, /*is_last=*/false); |
143 | 0 | env = strchr(env, ':'); |
144 | 0 | if (env != nullptr) { |
145 | 0 | handle_cpu_env(cap + 2, env + 1, /*is_last=*/true); |
146 | 0 | } |
147 | 0 | } |
148 | | |
149 | 1 | void OPENSSL_cpuid_setup(void) { |
150 | | // Determine the vendor and maximum input value. |
151 | 1 | uint32_t eax, ebx, ecx, edx; |
152 | 1 | OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); |
153 | | |
154 | 1 | uint32_t num_ids = eax; |
155 | | |
156 | 1 | int is_intel = ebx == 0x756e6547 /* Genu */ && // |
157 | 1 | edx == 0x49656e69 /* ineI */ && // |
158 | 1 | ecx == 0x6c65746e /* ntel */; |
159 | 1 | int is_amd = ebx == 0x68747541 /* Auth */ && // |
160 | 1 | edx == 0x69746e65 /* enti */ && // |
161 | 1 | ecx == 0x444d4163 /* cAMD */; |
162 | | |
163 | 1 | uint32_t extended_features[2] = {0}; |
164 | 1 | if (num_ids >= 7) { |
165 | 1 | OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); |
166 | 1 | extended_features[0] = ebx; |
167 | 1 | extended_features[1] = ecx; |
168 | 1 | } |
169 | | |
170 | 1 | OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); |
171 | | |
172 | 1 | const uint32_t base_family = (eax >> 8) & 15; |
173 | 1 | const uint32_t base_model = (eax >> 4) & 15; |
174 | | |
175 | 1 | uint32_t family = base_family; |
176 | 1 | uint32_t model = base_model; |
177 | 1 | if (base_family == 15) { |
178 | 1 | const uint32_t ext_family = (eax >> 20) & 255; |
179 | 1 | family += ext_family; |
180 | 1 | } |
181 | 1 | if (base_family == 6 || base_family == 15) { |
182 | 1 | const uint32_t ext_model = (eax >> 16) & 15; |
183 | 1 | model |= ext_model << 4; |
184 | 1 | } |
185 | | |
186 | 1 | if (is_amd) { |
187 | 1 | if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) { |
188 | | // Disable RDRAND on AMD families before 0x17 (Zen) due to reported |
189 | | // failures after suspend. |
190 | | // https://bugzilla.redhat.com/show_bug.cgi?id=1150286 |
191 | | // Also disable for family 0x17, models 0x70–0x7f, due to possible RDRAND |
192 | | // failures there too. |
193 | 0 | ecx &= ~(1u << 30); |
194 | 0 | } |
195 | 1 | } |
196 | | |
197 | | // Reserved bit #30 is repurposed to signal an Intel CPU. |
198 | 1 | if (is_intel) { |
199 | 0 | edx |= (1u << 30); |
200 | 1 | } else { |
201 | 1 | edx &= ~(1u << 30); |
202 | 1 | } |
203 | | |
204 | 1 | uint64_t xcr0 = 0; |
205 | 1 | if (ecx & (1u << 27)) { |
206 | | // XCR0 may only be queried if the OSXSAVE bit is set. |
207 | 1 | xcr0 = OPENSSL_xgetbv(0); |
208 | 1 | } |
209 | | // See Intel manual, volume 1, section 14.3. |
210 | 1 | if ((xcr0 & 6) != 6) { |
211 | | // YMM registers cannot be used. |
212 | 0 | ecx &= ~(1u << 28); // AVX |
213 | 0 | ecx &= ~(1u << 12); // FMA |
214 | 0 | ecx &= ~(1u << 11); // AMD XOP |
215 | 0 | extended_features[0] &= ~(1u << 5); // AVX2 |
216 | 0 | extended_features[1] &= ~(1u << 9); // VAES |
217 | 0 | extended_features[1] &= ~(1u << 10); // VPCLMULQDQ |
218 | 0 | } |
219 | | // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation |
220 | | // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups |
221 | | // Operating at 256 and 128-bit Vector Lengths"). |
222 | 1 | if (!os_supports_avx512(xcr0)) { |
223 | | // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM |
224 | | // registers, masking, SIMD registers 16-31 (even if accessed as YMM or |
225 | | // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only |
226 | | // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on |
227 | | // shorter vectors, since AVX512 ties everything to the availability of |
228 | | // 512-bit vectors. See the above-mentioned sections of the Intel manual, |
229 | | // which say that *all* these XCR0 bits must be checked even when just using |
230 | | // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD |
231 | | // Equations for EVEX") which says that all EVEX-coded instructions raise an |
232 | | // undefined-instruction exception if any of these XCR0 bits is zero. |
233 | 1 | extended_features[0] &= ~(1u << 16); // AVX512F |
234 | 1 | extended_features[0] &= ~(1u << 17); // AVX512DQ |
235 | 1 | extended_features[0] &= ~(1u << 21); // AVX512IFMA |
236 | 1 | extended_features[0] &= ~(1u << 26); // AVX512PF |
237 | 1 | extended_features[0] &= ~(1u << 27); // AVX512ER |
238 | 1 | extended_features[0] &= ~(1u << 28); // AVX512CD |
239 | 1 | extended_features[0] &= ~(1u << 30); // AVX512BW |
240 | 1 | extended_features[0] &= ~(1u << 31); // AVX512VL |
241 | 1 | extended_features[1] &= ~(1u << 1); // AVX512VBMI |
242 | 1 | extended_features[1] &= ~(1u << 6); // AVX512VBMI2 |
243 | 1 | extended_features[1] &= ~(1u << 11); // AVX512VNNI |
244 | 1 | extended_features[1] &= ~(1u << 12); // AVX512BITALG |
245 | 1 | extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ |
246 | 1 | } |
247 | | |
248 | | // Repurpose the bit for the removed MPX feature to indicate when using zmm |
249 | | // registers should be avoided even when they are supported. (When set, AVX512 |
250 | | // features can still be used, but only using ymm or xmm registers.) Skylake |
251 | | // suffered from severe downclocking when zmm registers were used, which |
252 | | // affected unrelated code running on the system, making zmm registers not too |
253 | | // useful outside of benchmarks. The situation improved significantly by Ice |
254 | | // Lake, but a small amount of downclocking remained. (See |
255 | | // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) |
256 | | // We take a conservative approach of not allowing zmm registers until after |
257 | | // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. |
258 | | // |
259 | | // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported |
260 | | // to have any downclocking problem when zmm registers are used. |
261 | 1 | if (is_intel && family == 6 && |
262 | 1 | (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) |
263 | 0 | model == 106 || // Ice Lake (server) |
264 | 0 | model == 108 || // Ice Lake (micro server) |
265 | 0 | model == 125 || // Ice Lake (client) |
266 | 0 | model == 126 || // Ice Lake (mobile) |
267 | 0 | model == 140 || // Tiger Lake (mobile) |
268 | 0 | model == 141)) { // Tiger Lake (client) |
269 | 0 | extended_features[0] |= 1u << 14; |
270 | 1 | } else { |
271 | 1 | extended_features[0] &= ~(1u << 14); |
272 | 1 | } |
273 | | |
274 | 1 | OPENSSL_ia32cap_P[0] = edx; |
275 | 1 | OPENSSL_ia32cap_P[1] = ecx; |
276 | 1 | OPENSSL_ia32cap_P[2] = extended_features[0]; |
277 | 1 | OPENSSL_ia32cap_P[3] = extended_features[1]; |
278 | | |
279 | 1 | const char *env = getenv("OPENSSL_ia32cap"); |
280 | 1 | if (env != nullptr) { |
281 | 0 | OPENSSL_adjust_ia32cap(OPENSSL_ia32cap_P, env); |
282 | 0 | } |
283 | 1 | } |
284 | | |
285 | | #endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) |