/src/libjxl/third_party/highway/hwy/targets.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2019 Google LLC |
2 | | // SPDX-License-Identifier: Apache-2.0 |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | // you may not use this file except in compliance with the License. |
6 | | // You may obtain a copy of the License at |
7 | | // |
8 | | // http://www.apache.org/licenses/LICENSE-2.0 |
9 | | // |
10 | | // Unless required by applicable law or agreed to in writing, software |
11 | | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | // See the License for the specific language governing permissions and |
14 | | // limitations under the License. |
15 | | |
16 | | #include "hwy/targets.h" |
17 | | |
18 | | #include <stdint.h> |
19 | | #include <stdio.h> |
20 | | |
21 | | #include "hwy/base.h" |
22 | | #include "hwy/detect_targets.h" |
23 | | #include "hwy/highway.h" |
24 | | #include "hwy/per_target.h" // VectorBytes |
25 | | |
26 | | #if HWY_ARCH_X86 |
27 | | #include <xmmintrin.h> |
28 | | #if HWY_COMPILER_MSVC |
29 | | #include <intrin.h> |
30 | | #else // !HWY_COMPILER_MSVC |
31 | | #include <cpuid.h> |
32 | | #endif // HWY_COMPILER_MSVC |
33 | | |
34 | | #elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV) && \ |
35 | | HWY_OS_LINUX |
36 | | // sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we |
37 | | // still include this directly. See #1199. |
38 | | #ifndef TOOLCHAIN_MISS_ASM_HWCAP_H |
39 | | #include <asm/hwcap.h> |
40 | | #endif |
41 | | #if HWY_HAVE_AUXV |
42 | | #include <sys/auxv.h> |
43 | | #endif |
44 | | |
45 | | #endif // HWY_ARCH_* |
46 | | |
47 | | #if HWY_OS_APPLE |
48 | | #include <sys/sysctl.h> |
49 | | #include <sys/utsname.h> |
50 | | #endif // HWY_OS_APPLE |
51 | | |
52 | | namespace hwy { |
53 | | namespace { |
54 | | |
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
// 0 means "mock not active"; SupportedTargets() then performs real detection.
int64_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
// Default (all bits set) disables nothing; applied in SupportedTargets().
int64_t supported_mask_ = LimitsMax<int64_t>();
61 | | |
62 | | #if HWY_OS_APPLE |
63 | | static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature( |
64 | | const char* feature_name) { |
65 | | int result = 0; |
66 | | size_t len = sizeof(int); |
67 | | return (sysctlbyname(feature_name, &result, &len, nullptr, 0) == 0 && |
68 | | result != 0); |
69 | | } |
70 | | |
71 | | static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr, |
72 | | uint32_t& parsed_val) { |
73 | | uint64_t parsed_u64 = 0; |
74 | | |
75 | | const char* start_ptr = ptr; |
76 | | for (char ch; (ch = (*ptr)) != '\0'; ++ptr) { |
77 | | unsigned digit = static_cast<unsigned>(static_cast<unsigned char>(ch)) - |
78 | | static_cast<unsigned>(static_cast<unsigned char>('0')); |
79 | | if (digit > 9u) { |
80 | | break; |
81 | | } |
82 | | |
83 | | parsed_u64 = (parsed_u64 * 10u) + digit; |
84 | | if (parsed_u64 > 0xFFFFFFFFu) { |
85 | | return false; |
86 | | } |
87 | | } |
88 | | |
89 | | parsed_val = static_cast<uint32_t>(parsed_u64); |
90 | | return (ptr != start_ptr); |
91 | | } |
92 | | |
93 | | static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() { |
94 | | utsname uname_buf; |
95 | | ZeroBytes(&uname_buf, sizeof(utsname)); |
96 | | |
97 | | if ((uname(&uname_buf)) != 0) { |
98 | | return false; |
99 | | } |
100 | | |
101 | | const char* ptr = uname_buf.release; |
102 | | if (!ptr) { |
103 | | return false; |
104 | | } |
105 | | |
106 | | uint32_t major; |
107 | | uint32_t minor; |
108 | | if (!ParseU32(ptr, major)) { |
109 | | return false; |
110 | | } |
111 | | |
112 | | if (*ptr != '.') { |
113 | | return false; |
114 | | } |
115 | | |
116 | | ++ptr; |
117 | | if (!ParseU32(ptr, minor)) { |
118 | | return false; |
119 | | } |
120 | | |
121 | | // We are running on macOS 12.2 or later if the Darwin kernel version is 21.3 |
122 | | // or later |
123 | | return (major > 21 || (major == 21 && minor >= 3)); |
124 | | } |
125 | | #endif // HWY_OS_APPLE |
126 | | |
127 | | #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH |
128 | | namespace x86 { |
129 | | |
// Calls CPUID instruction with eax=level and ecx=count and returns the result
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
                      uint32_t* HWY_RESTRICT abcd) {
#if HWY_COMPILER_MSVC
  // MSVC intrinsic fills regs = {eax, ebx, ecx, edx}.
  int regs[4];
  __cpuidex(regs, level, count);
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else   // HWY_COMPILER_MSVC
  // GCC/Clang intrinsic from <cpuid.h>.
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
#endif  // HWY_COMPILER_MSVC
}
152 | | |
153 | 70 | HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) { |
154 | 70 | return (reg & (1U << index)) != 0; |
155 | 70 | } |
156 | | |
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else  // HWY_COMPILER_MSVC
  uint32_t xcr0, xcr0_high;
  const uint32_t index = 0;
  // Raw encoding of XGETBV (0F 01 D0); avoids requiring compiler/assembler
  // support for the mnemonic. ECX selects XCR index 0; result is EDX:EAX.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  return xcr0;
#endif  // HWY_COMPILER_MSVC
}
171 | | |
172 | 0 | bool IsAMD() { |
173 | 0 | uint32_t abcd[4]; |
174 | 0 | Cpuid(0, 0, abcd); |
175 | 0 | const uint32_t max_level = abcd[0]; |
176 | 0 | return max_level >= 1 && abcd[1] == 0x68747541 && abcd[2] == 0x444d4163 && |
177 | 0 | abcd[3] == 0x69746e65; |
178 | 0 | } |
179 | | |
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct. These are private to
// this file; they are OR-ed into the uint64_t returned by FlagsFromCPUID and
// matched against the kGroup* constants below.
enum class FeatureIndex : uint32_t {
  kSSE = 0,
  kSSE2,
  kSSE3,
  kSSSE3,

  kSSE41,
  kSSE42,
  kCLMUL,
  kAES,

  kAVX,
  kAVX2,
  kF16C,
  kFMA,
  kLZCNT,
  kBMI,
  kBMI2,

  kAVX512F,
  kAVX512VL,
  kAVX512CD,
  kAVX512DQ,
  kAVX512BW,
  kAVX512FP16,
  kAVX512BF16,

  kVNNI,
  kVPCLMULQDQ,
  kVBMI,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
  kBITALG,
  kGFNI,

  // Must remain last: counts the number of feature bits.
  kSentinel
};
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
              "Too many bits for u64");
222 | | |
223 | 60 | HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) { |
224 | 60 | return 1ull << static_cast<size_t>(index); |
225 | 60 | } |
226 | | |
// Returns bit array of FeatureIndex from CPUID feature flags.
// abcd[0..3] hold EAX, EBX, ECX, EDX of the queried leaf, respectively.
uint64_t FlagsFromCPUID() {
  uint64_t flags = 0;  // return value
  uint32_t abcd[4];
  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];  // highest supported standard leaf

  // Standard feature flags: leaf 1, EDX (abcd[3]) and ECX (abcd[2]).
  Cpuid(1, 0, abcd);
  flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
  flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
  flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
  flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
  flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
  flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
  flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
  flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
  flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
  flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
  flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;

  // Extended feature flags: leaf 0x80000001, ECX.
  Cpuid(0x80000001U, 0, abcd);
  flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;

  // Extended features: leaf 7, subleaves 0 and 1 (only if leaf 7 exists).
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
    flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
    flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;

    flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
    flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
    flags |= IsBitSet(abcd[1], 28) ? Bit(FeatureIndex::kAVX512CD) : 0;
    flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
    flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;

    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
    flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
    flags |= IsBitSet(abcd[2], 8) ? Bit(FeatureIndex::kGFNI) : 0;
    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
    flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
    flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
    flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;

    flags |= IsBitSet(abcd[3], 23) ? Bit(FeatureIndex::kAVX512FP16) : 0;

    Cpuid(7, 1, abcd);
    flags |= IsBitSet(abcd[0], 5) ? Bit(FeatureIndex::kAVX512BF16) : 0;
  }

  return flags;
}
282 | | |
// Each Highway target requires a 'group' of multiple features/flags.
// Groups are cumulative: each one includes its predecessor.
constexpr uint64_t kGroupSSE2 =
    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2);

constexpr uint64_t kGroupSSSE3 =
    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2;

constexpr uint64_t kGroupSSE4 =
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;

// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
// avoiding using and requiring these so AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
constexpr uint64_t kGroupBMI2_FMA = 0;
#else
constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
                                    Bit(FeatureIndex::kBMI2) |
                                    Bit(FeatureIndex::kFMA);
#endif

// F16C can likewise be excluded from the AVX2 requirements.
#ifdef HWY_DISABLE_F16C
constexpr uint64_t kGroupF16C = 0;
#else
constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
#endif

constexpr uint64_t kGroupAVX2 =
    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;

constexpr uint64_t kGroupAVX3 =
    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) |
    Bit(FeatureIndex::kAVX512CD) | kGroupAVX2;

constexpr uint64_t kGroupAVX3_DL =
    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
    Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3;

constexpr uint64_t kGroupAVX3_ZEN4 =
    Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL;

constexpr uint64_t kGroupAVX3_SPR =
    Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4;
332 | | |
// Returns the bitmask of x86 Highway targets supported by this CPU and OS,
// combining CPUID feature groups with XSAVE/XCR0 checks.
int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) {
    bits |= HWY_SSE2;  // always present in x64
  }

  const uint64_t flags = FlagsFromCPUID();
  // Set target bit(s) if all their group's flags are all set.
  if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) {
    bits |= HWY_AVX3_SPR;
  }
  if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
    bits |= HWY_AVX3_DL;
  }
  if ((flags & kGroupAVX3) == kGroupAVX3) {
    bits |= HWY_AVX3;
  }
  if ((flags & kGroupAVX2) == kGroupAVX2) {
    bits |= HWY_AVX2;
  }
  if ((flags & kGroupSSE4) == kGroupSSE4) {
    bits |= HWY_SSE4;
  }
  if ((flags & kGroupSSSE3) == kGroupSSSE3) {
    bits |= HWY_SSSE3;
  }
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_32) {
    // On 32-bit x86, SSE2 must be detected rather than assumed.
    if ((flags & kGroupSSE2) == kGroupSSE2) {
      bits |= HWY_SSE2;
    }
  }

  // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
  // YMM/ZMM registers are not preserved across context switches.

  // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
  // context switches on x86_64

  // The following OS's are known to preserve the lower 128 bits of XMM
  // registers across context switches on x86 CPU's that support SSE (even in
  // 32-bit mode):
  // - Windows 2000 or later
  // - Linux 2.4.0 or later
  // - Mac OS X 10.4 or later
  // - FreeBSD 4.4 or later
  // - NetBSD 1.6 or later
  // - OpenBSD 3.5 or later
  // - UnixWare 7 Release 7.1.1 or later
  // - Solaris 9 4/04 or later

  uint32_t abcd[4];
  Cpuid(1, 0, abcd);
  const bool has_xsave = IsBitSet(abcd[2], 26);
  const bool has_osxsave = IsBitSet(abcd[2], 27);
  // HWY_AVX2 plus every bit below it, i.e. AVX2 and all better targets
  // (better Highway targets have numerically smaller bit values).
  constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);

  if (has_xsave && has_osxsave) {
#if HWY_OS_APPLE
    // On macOS, check for AVX3 XSAVE support by checking that we are running
    // on macOS 12.2 or later and HasCpuFeature("hw.optional.avx512f") returns
    // true.

    // There is a bug in macOS 12.1 or earlier that can cause ZMM16-ZMM31, the
    // upper 256 bits of the ZMM registers, and K0-K7 (the AVX512 mask
    // registers) to not be properly preserved across a context switch on
    // macOS 12.1 or earlier.

    // This bug on macOS 12.1 or earlier on x86_64 CPU's with AVX3 support is
    // described at
    // https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259,
    // https://github.com/golang/go/issues/49233, and
    // https://github.com/simdutf/simdutf/pull/236.

    // In addition to the bug that is there on macOS 12.1 or earlier, bits 5,
    // 6, and 7 can be set to 0 on x86_64 CPU's with AVX3 support on macOS
    // until the first AVX512 instruction is executed as macOS only preserves
    // ZMM16-ZMM31, the upper 256 bits of the ZMM registers, and K0-K7 across
    // a context switch on threads that have executed an AVX512 instruction.

    // Checking for AVX3 XSAVE support on macOS using
    // HasCpuFeature("hw.optional.avx512f") avoids false negative results
    // on x86_64 CPU's that have AVX3 support.
    const bool have_avx3_xsave_support =
        IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f");
#endif

    const uint32_t xcr0 = ReadXCR0();
    constexpr int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_SPR;
    // XMM/YMM
    if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
      // Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
      bits &= ~min_avx2;
    }

#if !HWY_OS_APPLE
    // On OS's other than macOS, check for AVX3 XSAVE support by checking that
    // bits 5, 6, and 7 of XCR0 are set.
    const bool have_avx3_xsave_support =
        IsBitSet(xcr0, 5) && IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);
#endif

    // opmask, ZMM lo/hi
    if (!have_avx3_xsave_support) {
      bits &= ~min_avx3;
    }
  } else {  // !has_xsave || !has_osxsave
    // Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
    bits &= ~min_avx2;
  }

  // This is mainly to work around the slow Zen4 CompressStore. It's unclear
  // whether subsequent AMD models will be affected; assume yes.
  if ((bits & HWY_AVX3_DL) && (flags & kGroupAVX3_ZEN4) == kGroupAVX3_ZEN4 &&
      IsAMD()) {
    bits |= HWY_AVX3_ZEN4;
  }

  return bits;
}
451 | | |
452 | | } // namespace x86 |
453 | | #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH |
454 | | namespace arm { |
// Returns the bitmask of Arm Highway targets supported by this CPU/OS.
// On Linux/Android the kernel's HWCAP auxiliary vectors are used; on Apple,
// sysctl feature queries (HasCpuFeature) are used instead.
int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

  using CapBits = unsigned long;  // NOLINT
#if HWY_OS_APPLE
  // No getauxval here; features are queried via HasCpuFeature below.
  const CapBits hw = 0UL;
#else
  // For Android, this has been supported since API 20 (2014).
  const CapBits hw = getauxval(AT_HWCAP);
#endif
  (void)hw;

#if HWY_ARCH_ARM_A64
  bits |= HWY_NEON_WITHOUT_AES;  // aarch64 always has NEON and VFPv4..

#if HWY_OS_APPLE
  if (HasCpuFeature("hw.optional.arm.FEAT_AES")) {
    bits |= HWY_NEON;

    if (HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") &&
        HasCpuFeature("hw.optional.arm.FEAT_DotProd") &&
        HasCpuFeature("hw.optional.arm.FEAT_BF16")) {
      bits |= HWY_NEON_BF16;
    }
  }
#else  // !HWY_OS_APPLE
  // .. but not necessarily AES, which is required for HWY_NEON.
#if defined(HWCAP_AES)
  if (hw & HWCAP_AES) {
    bits |= HWY_NEON;

    // NEON_BF16 additionally requires FP16 conversion, dot product and BF16.
#if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16)
    const CapBits hw2 = getauxval(AT_HWCAP2);
    const int64_t kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP;
    if ((hw & kGroupF16Dot) == kGroupF16Dot && (hw2 & HWCAP2_BF16)) {
      bits |= HWY_NEON_BF16;
    }
#endif  // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16
  }
#endif  // HWCAP_AES

#if defined(HWCAP_SVE)
  if (hw & HWCAP_SVE) {
    bits |= HWY_SVE;
  }
#endif

  // Fallback definitions for old hwcap.h lacking these HWCAP2 bits.
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 (1 << 1)
#endif
#ifndef HWCAP2_SVEAES
#define HWCAP2_SVEAES (1 << 2)
#endif
  const CapBits hw2 = getauxval(AT_HWCAP2);
  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
    bits |= HWY_SVE2;
  }
#endif  // HWY_OS_APPLE

#else  // !HWY_ARCH_ARM_A64

  // Some old auxv.h / hwcap.h do not define these. If not, treat as
  // unsupported.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
    bits |= HWY_NEON_WITHOUT_AES;
  }
#endif

  // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
  // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
  // requires native AES instructions, we do not enable that target here.

#endif  // HWY_ARCH_ARM_A64
  return bits;
}
530 | | } // namespace arm |
531 | | #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH |
532 | | namespace ppc { |
533 | | |
// Fallback definitions for toolchains whose hwcap.h lacks these PPC bits.
#ifndef PPC_FEATURE_HAS_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#endif

#ifndef PPC_FEATURE_HAS_VSX
#define PPC_FEATURE_HAS_VSX 0x00000080
#endif

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000
#endif

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif

#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000
#endif

#ifndef PPC_FEATURE2_ARCH_3_1
#define PPC_FEATURE2_ARCH_3_1 0x00040000
#endif

using CapBits = unsigned long;  // NOLINT

// For AT_HWCAP, the others are for AT_HWCAP2
constexpr CapBits kGroupVSX = PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;

// Crypto can be excluded from the PPC8 requirements via build flag.
#if defined(HWY_DISABLE_PPC8_CRYPTO)
constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07;
#else
constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
#endif
constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00;
constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1;
570 | | |
// Returns the bitmask of PPC Highway targets supported by this CPU, derived
// from the AT_HWCAP/AT_HWCAP2 auxiliary vectors. Returns 0 (no targets) if
// the toolchain does not provide AT_HWCAP/AT_HWCAP2.
int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

#if defined(AT_HWCAP) && defined(AT_HWCAP2)
  // AltiVec+VSX is the baseline requirement for all PPC targets; only query
  // AT_HWCAP2 once that baseline is established.
  const CapBits hwcap = getauxval(AT_HWCAP);
  if ((hwcap & kGroupVSX) == kGroupVSX) {
    const CapBits hwcap2 = getauxval(AT_HWCAP2);
    if ((hwcap2 & kGroupPPC8) == kGroupPPC8) {
      bits |= HWY_PPC8;
    }
    if ((hwcap2 & kGroupPPC9) == kGroupPPC9) {
      bits |= HWY_PPC9;
    }
    if ((hwcap2 & kGroupPPC10) == kGroupPPC10) {
      bits |= HWY_PPC10;
    }
  }
#endif  // defined(AT_HWCAP) && defined(AT_HWCAP2)

  return bits;
}
593 | | } // namespace ppc |
594 | | #elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH |
595 | | namespace s390x { |
596 | | |
// Fallback definitions for toolchains whose hwcap.h lacks these s390x bits.
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX 2048
#endif

#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192
#endif

#ifndef HWCAP_S390_VXRS_EXT2
#define HWCAP_S390_VXRS_EXT2 32768
#endif

using CapBits = unsigned long;  // NOLINT

// Feature groups required for the Z14 and Z15 targets, respectively.
constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE;
constexpr CapBits kGroupZ15 =
    HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2;
614 | | |
// Returns the bitmask of s390x Highway targets supported by this CPU,
// derived from the AT_HWCAP auxiliary vector. Returns 0 if AT_HWCAP is
// unavailable.
int64_t DetectTargets() {
  int64_t bits = 0;

#if defined(AT_HWCAP)
  const CapBits hwcap = getauxval(AT_HWCAP);
  if ((hwcap & kGroupZ14) == kGroupZ14) {
    bits |= HWY_Z14;
  }
  if ((hwcap & kGroupZ15) == kGroupZ15) {
    bits |= HWY_Z15;
  }
#endif

  return bits;
}
632 | | } // namespace s390x |
633 | | #elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH |
634 | | namespace rvv { |
635 | | |
// Bit for the RISC-V 'V' (vector) extension in AT_HWCAP.
// NOTE(review): the guard tests HWCAP_RVV but defines COMPAT_HWCAP_ISA_V
// (used below). If a header defined HWCAP_RVV without COMPAT_HWCAP_ISA_V,
// the use below would fail to compile — consider guarding on
// COMPAT_HWCAP_ISA_V instead; confirm against asm/hwcap.h.
#ifndef HWCAP_RVV
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif

using CapBits = unsigned long;  // NOLINT
641 | | |
// Returns the bitmask of RISC-V Highway targets supported by this CPU:
// HWY_RVV if the kernel reports the 'V' extension and a vsetvli probe
// confirms a usable vector unit, otherwise 0.
int64_t DetectTargets() {
  int64_t bits = 0;

  const CapBits hw = getauxval(AT_HWCAP);

  if ((hw & COMPAT_HWCAP_ISA_V) == COMPAT_HWCAP_ISA_V) {
    size_t e8m1_vec_len;
    // vtype is XLEN-sized; its sign bit (VILL) indicates an invalid setting.
#if HWY_ARCH_RISCV_64
    int64_t vtype_reg_val;
#else
    int32_t vtype_reg_val;
#endif

    // Check that a vuint8m1_t vector is at least 16 bytes and that tail
    // agnostic and mask agnostic mode are supported
    asm volatile(
        // Avoid compiler error on GCC or Clang if -march=rv64gcv1p0 or
        // -march=rv32gcv1p0 option is not specified on the command line
        ".option push\n\t"
        ".option arch, +v\n\t"
        "vsetvli %0, zero, e8, m1, ta, ma\n\t"
        "csrr %1, vtype\n\t"
        ".option pop"
        : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));

    // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
    // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
    // bytes. vtype_reg_val >= 0 tests that the sign (VILL) bit is clear.
    if (vtype_reg_val >= 0 && e8m1_vec_len >= 16) {
      bits |= HWY_RVV;
    }
  }

  return bits;
}
677 | | } // namespace rvv |
678 | | #endif // HWY_ARCH_* |
679 | | |
// Returns targets supported by the CPU, independently of DisableTargets.
// Factored out of SupportedTargets to make its structure more obvious. Note
// that x86 CPUID may take several hundred cycles.
int64_t DetectTargets() {
  // Apps will use only one of these (the default is EMU128), but compile flags
  // for this TU may differ from that of the app, so allow both.
  int64_t bits = HWY_SCALAR | HWY_EMU128;

  // Exactly one per-architecture detector is compiled in.
#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
  bits |= x86::DetectTargets();
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  bits |= arm::DetectTargets();
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
  bits |= ppc::DetectTargets();
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
  bits |= s390x::DetectTargets();
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
  bits |= rvv::DetectTargets();

#else
  // TODO(janwas): detect support for WASM.
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
  // supported.
  bits |= HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_*

  // Warn if the CPU lacks targets that were enabled at compile time.
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    const uint64_t bits_u = static_cast<uint64_t>(bits);
    const uint64_t enabled = static_cast<uint64_t>(HWY_ENABLED_BASELINE);
    fprintf(stderr,
            "WARNING: CPU supports 0x%08x%08x, software requires 0x%08x%08x\n",
            static_cast<uint32_t>(bits_u >> 32),
            static_cast<uint32_t>(bits_u & 0xFFFFFFFF),
            static_cast<uint32_t>(enabled >> 32),
            static_cast<uint32_t>(enabled & 0xFFFFFFFF));
  }

  return bits;
}
720 | | |
721 | | } // namespace |
722 | | |
723 | 0 | HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { |
724 | 0 | supported_mask_ = static_cast<int64_t>(~disabled_targets); |
725 | | // This will take effect on the next call to SupportedTargets, which is |
726 | | // called right before GetChosenTarget::Update. However, calling Update here |
727 | | // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want |
728 | | // to check in tests. We instead de-initialize such that the next |
729 | | // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache. |
730 | 0 | GetChosenTarget().DeInit(); |
731 | 0 | } |
732 | | |
// Overrides detection with `targets` for tests; 0 restores real detection
// (see SupportedTargets).
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();  // see comment in DisableTargets
}
737 | | |
// Returns the detected (or mocked) targets, minus any disabled via
// DisableTargets; falls back to HWY_STATIC_TARGET if nothing remains.
HWY_DLLEXPORT int64_t SupportedTargets() {
  int64_t targets = supported_targets_for_test_;
  if (HWY_LIKELY(targets == 0)) {
    // Mock not active. Re-detect instead of caching just in case we're on a
    // heterogeneous ISA (also requires some app support to pin threads). This
    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
    // DisableTargets or SetSupportedTargetsForTest.
    targets = DetectTargets();

    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
    // first set up ChosenTarget. No need to Update() again afterwards with the
    // final targets - that will be done by a caller of this function.
    GetChosenTarget().Update(targets);

    // Now that we can call VectorBytes, check for targets with specific sizes.
    if (HWY_ARCH_ARM_A64) {
      const size_t vec_bytes = VectorBytes();  // uncached, see declaration
      // SVE_256 / SVE2_128 are only valid at exactly 32 / 16 bytes.
      if ((targets & HWY_SVE) && vec_bytes == 32) {
        targets = static_cast<int64_t>(targets | HWY_SVE_256);
      } else {
        targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
      }
      if ((targets & HWY_SVE2) && vec_bytes == 16) {
        targets = static_cast<int64_t>(targets | HWY_SVE2_128);
      } else {
        targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
      }
    }  // HWY_ARCH_ARM_A64
  }

  targets &= supported_mask_;
  return targets == 0 ? HWY_STATIC_TARGET : targets;
}
771 | | |
// Returns the process-wide ChosenTarget singleton used by dynamic dispatch.
// Initialization of the function-local static is thread-safe in C++11.
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
  static ChosenTarget chosen_target;
  return chosen_target;
}
776 | | |
777 | | } // namespace hwy |