Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/third_party/highway/hwy/targets.cc
Line | Count | Source
1
// Copyright 2019 Google LLC
2
// SPDX-License-Identifier: Apache-2.0
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
//      http://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
16
#include "hwy/targets.h"
17
18
#include <stdint.h>
19
#include <stdio.h>
20
21
#include "hwy/base.h"
22
#include "hwy/detect_targets.h"
23
#include "hwy/highway.h"
24
#include "hwy/per_target.h"  // VectorBytes
25
26
#if HWY_ARCH_X86
27
#include <xmmintrin.h>
28
#if HWY_COMPILER_MSVC
29
#include <intrin.h>
30
#else  // !HWY_COMPILER_MSVC
31
#include <cpuid.h>
32
#endif  // HWY_COMPILER_MSVC
33
34
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV) && \
35
    HWY_OS_LINUX
36
// sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we
37
// still include this directly. See #1199.
38
#ifndef TOOLCHAIN_MISS_ASM_HWCAP_H
39
#include <asm/hwcap.h>
40
#endif
41
#if HWY_HAVE_AUXV
42
#include <sys/auxv.h>
43
#endif
44
45
#endif  // HWY_ARCH_*
46
47
#if HWY_OS_APPLE
48
#include <sys/sysctl.h>
49
#include <sys/utsname.h>
50
#endif  // HWY_OS_APPLE
51
52
namespace hwy {
53
namespace {
54
55
// When running tests, this value can be set to the mocked supported targets
56
// mask. Only written to from a single thread before the test starts.
57
int64_t supported_targets_for_test_ = 0;
58
59
// Mask of targets disabled at runtime with DisableTargets.
60
int64_t supported_mask_ = LimitsMax<int64_t>();
61
62
#if HWY_OS_APPLE
63
static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature(
64
    const char* feature_name) {
65
  int result = 0;
66
  size_t len = sizeof(int);
67
  return (sysctlbyname(feature_name, &result, &len, nullptr, 0) == 0 &&
68
          result != 0);
69
}
70
71
static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr,
72
                                                 uint32_t& parsed_val) {
73
  uint64_t parsed_u64 = 0;
74
75
  const char* start_ptr = ptr;
76
  for (char ch; (ch = (*ptr)) != '\0'; ++ptr) {
77
    unsigned digit = static_cast<unsigned>(static_cast<unsigned char>(ch)) -
78
                     static_cast<unsigned>(static_cast<unsigned char>('0'));
79
    if (digit > 9u) {
80
      break;
81
    }
82
83
    parsed_u64 = (parsed_u64 * 10u) + digit;
84
    if (parsed_u64 > 0xFFFFFFFFu) {
85
      return false;
86
    }
87
  }
88
89
  parsed_val = static_cast<uint32_t>(parsed_u64);
90
  return (ptr != start_ptr);
91
}
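// Illustrative sketch (annotation, not part of targets.cc): ParseU32 consumes
// leading digits, advances `ptr` to the first non-digit, and reports whether
// any digit was read. For a Darwin release string such as "21.3.0":
//   const char* p = "21.3.0";
//   uint32_t major = 0, minor = 0;
//   ParseU32(p, major);  // major == 21, *p is now '.'
//   ++p;
//   ParseU32(p, minor);  // minor == 3, *p is now '.'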
92
93
static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() {
94
  utsname uname_buf;
95
  ZeroBytes(&uname_buf, sizeof(utsname));
96
97
  if ((uname(&uname_buf)) != 0) {
98
    return false;
99
  }
100
101
  const char* ptr = uname_buf.release;
102
  if (!ptr) {
103
    return false;
104
  }
105
106
  uint32_t major;
107
  uint32_t minor;
108
  if (!ParseU32(ptr, major)) {
109
    return false;
110
  }
111
112
  if (*ptr != '.') {
113
    return false;
114
  }
115
116
  ++ptr;
117
  if (!ParseU32(ptr, minor)) {
118
    return false;
119
  }
120
121
  // We are running on macOS 12.2 or later if the Darwin kernel version is 21.3
122
  // or later
123
  return (major > 21 || (major == 21 && minor >= 3));
124
}
125
#endif  // HWY_OS_APPLE
126
127
#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
128
namespace x86 {
129
130
// Calls CPUID instruction with eax=level and ecx=count and returns the result
131
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
132
HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
133
12
                      uint32_t* HWY_RESTRICT abcd) {
134
#if HWY_COMPILER_MSVC
135
  int regs[4];
136
  __cpuidex(regs, level, count);
137
  for (int i = 0; i < 4; ++i) {
138
    abcd[i] = regs[i];
139
  }
140
#else   // HWY_COMPILER_MSVC
141
12
  uint32_t a;
142
12
  uint32_t b;
143
12
  uint32_t c;
144
12
  uint32_t d;
145
12
  __cpuid_count(level, count, a, b, c, d);
146
12
  abcd[0] = a;
147
12
  abcd[1] = b;
148
12
  abcd[2] = c;
149
12
  abcd[3] = d;
150
12
#endif  // HWY_COMPILER_MSVC
151
12
}
152
153
70
HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
154
70
  return (reg & (1U << index)) != 0;
155
70
}
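// Illustrative sketch (annotation, not part of targets.cc): the query pattern
// used throughout this file combines Cpuid and IsBitSet, for example the XSAVE
// and OSXSAVE bits of CPUID leaf 1 that DetectTargets tests further below:
//   uint32_t abcd[4];
//   Cpuid(1, 0, abcd);
//   const bool has_xsave = IsBitSet(abcd[2], 26);    // ECX bit 26
//   const bool has_osxsave = IsBitSet(abcd[2], 27);  // ECX bit 27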
156
157
// Returns the lower 32 bits of extended control register 0.
158
// Requires CPU support for "OSXSAVE" (see below).
159
2
uint32_t ReadXCR0() {
160
#if HWY_COMPILER_MSVC
161
  return static_cast<uint32_t>(_xgetbv(0));
162
#else   // HWY_COMPILER_MSVC
163
2
  uint32_t xcr0, xcr0_high;
164
2
  const uint32_t index = 0;
165
2
  asm volatile(".byte 0x0F, 0x01, 0xD0"
166
2
               : "=a"(xcr0), "=d"(xcr0_high)
167
2
               : "c"(index));
168
2
  return xcr0;
169
2
#endif  // HWY_COMPILER_MSVC
170
2
}
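// Added note (annotation): DetectTargets below uses ReadXCR0 to confirm the OS
// actually saves the relevant register state. XCR0 bit 1 is SSE/XMM state and
// bit 2 is AVX/YMM state (both needed for AVX2); bits 5..7 cover the opmask
// registers and the upper ZMM state needed for AVX-512:
//   const uint32_t xcr0 = ReadXCR0();
//   const bool ymm_saved = IsBitSet(xcr0, 1) && IsBitSet(xcr0, 2);
//   const bool zmm_saved = ymm_saved && IsBitSet(xcr0, 5) &&
//                          IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);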
171
172
0
bool IsAMD() {
173
0
  uint32_t abcd[4];
174
0
  Cpuid(0, 0, abcd);
175
0
  const uint32_t max_level = abcd[0];
176
0
  return max_level >= 1 && abcd[1] == 0x68747541 && abcd[2] == 0x444d4163 &&
177
0
         abcd[3] == 0x69746e65;
178
0
}
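// Added note (annotation): the magic constants above are the little-endian
// bytes of the CPUID leaf-0 vendor string "AuthenticAMD": EBX = 0x68747541
// ("Auth"), EDX = 0x69746e65 ("enti"), ECX = 0x444d4163 ("cAMD").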
179
180
// Arbitrary bit indices indicating which instruction set extensions are
181
// supported. Use enum to ensure values are distinct.
182
enum class FeatureIndex : uint32_t {
183
  kSSE = 0,
184
  kSSE2,
185
  kSSE3,
186
  kSSSE3,
187
188
  kSSE41,
189
  kSSE42,
190
  kCLMUL,
191
  kAES,
192
193
  kAVX,
194
  kAVX2,
195
  kF16C,
196
  kFMA,
197
  kLZCNT,
198
  kBMI,
199
  kBMI2,
200
201
  kAVX512F,
202
  kAVX512VL,
203
  kAVX512CD,
204
  kAVX512DQ,
205
  kAVX512BW,
206
  kAVX512FP16,
207
  kAVX512BF16,
208
209
  kVNNI,
210
  kVPCLMULQDQ,
211
  kVBMI,
212
  kVBMI2,
213
  kVAES,
214
  kPOPCNTDQ,
215
  kBITALG,
216
  kGFNI,
217
218
  kSentinel
219
};
220
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
221
              "Too many bits for u64");
222
223
60
HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
224
60
  return 1ull << static_cast<size_t>(index);
225
60
}
226
227
// Returns bit array of FeatureIndex from CPUID feature flags.
228
2
uint64_t FlagsFromCPUID() {
229
2
  uint64_t flags = 0;  // return value
230
2
  uint32_t abcd[4];
231
2
  Cpuid(0, 0, abcd);
232
2
  const uint32_t max_level = abcd[0];
233
234
  // Standard feature flags
235
2
  Cpuid(1, 0, abcd);
236
2
  flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
237
2
  flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
238
2
  flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
239
2
  flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
240
2
  flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
241
2
  flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
242
2
  flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
243
2
  flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
244
2
  flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
245
2
  flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
246
2
  flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
247
248
  // Extended feature flags
249
2
  Cpuid(0x80000001U, 0, abcd);
250
2
  flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
251
252
  // Extended features
253
2
  if (max_level >= 7) {
254
2
    Cpuid(7, 0, abcd);
255
2
    flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
256
2
    flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
257
2
    flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
258
259
2
    flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
260
2
    flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
261
2
    flags |= IsBitSet(abcd[1], 28) ? Bit(FeatureIndex::kAVX512CD) : 0;
262
2
    flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
263
2
    flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
264
265
2
    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
266
2
    flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
267
2
    flags |= IsBitSet(abcd[2], 8) ? Bit(FeatureIndex::kGFNI) : 0;
268
2
    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
269
2
    flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
270
2
    flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
271
2
    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
272
2
    flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
273
274
2
    flags |= IsBitSet(abcd[3], 23) ? Bit(FeatureIndex::kAVX512FP16) : 0;
275
276
2
    Cpuid(7, 1, abcd);
277
2
    flags |= IsBitSet(abcd[0], 5) ? Bit(FeatureIndex::kAVX512BF16) : 0;
278
2
  }
279
280
2
  return flags;
281
2
}
282
283
// Each Highway target requires a 'group' of multiple features/flags.
284
constexpr uint64_t kGroupSSE2 =
285
    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2);
286
287
constexpr uint64_t kGroupSSSE3 =
288
    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2;
289
290
constexpr uint64_t kGroupSSE4 =
291
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
292
    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
293
294
// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
295
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
296
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
297
// avoiding using and requiring these so AVX2 can still be used.
298
#ifdef HWY_DISABLE_BMI2_FMA
299
constexpr uint64_t kGroupBMI2_FMA = 0;
300
#else
301
constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
302
                                    Bit(FeatureIndex::kBMI2) |
303
                                    Bit(FeatureIndex::kFMA);
304
#endif
305
306
#ifdef HWY_DISABLE_F16C
307
constexpr uint64_t kGroupF16C = 0;
308
#else
309
constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
310
#endif
311
312
constexpr uint64_t kGroupAVX2 =
313
    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
314
    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
315
316
constexpr uint64_t kGroupAVX3 =
317
    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
318
    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) |
319
    Bit(FeatureIndex::kAVX512CD) | kGroupAVX2;
320
321
constexpr uint64_t kGroupAVX3_DL =
322
    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
323
    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
324
    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
325
    Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3;
326
327
constexpr uint64_t kGroupAVX3_ZEN4 =
328
    Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL;
329
330
constexpr uint64_t kGroupAVX3_SPR =
331
    Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4;
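// Added note (annotation, not part of targets.cc): each group above is a
// superset of the previous one (SSE2 within SSSE3 within SSE4 within AVX2
// within AVX3 within AVX3_DL within AVX3_ZEN4 within AVX3_SPR), so a CPU that
// qualifies for a newer target necessarily qualifies for all older ones.
// For example, HWY_AVX2 requires (with default compile flags) AVX, AVX2,
// LZCNT, BMI, BMI2, FMA and F16C in addition to everything in kGroupSSE4.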
332
333
2
int64_t DetectTargets() {
334
2
  int64_t bits = 0;  // return value of supported targets.
335
2
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) {
336
2
    bits |= HWY_SSE2;  // always present in x64
337
2
  }
338
339
2
  const uint64_t flags = FlagsFromCPUID();
340
  // Set target bit(s) if all their group's flags are all set.
341
2
  if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) {
342
0
    bits |= HWY_AVX3_SPR;
343
0
  }
344
2
  if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
345
0
    bits |= HWY_AVX3_DL;
346
0
  }
347
2
  if ((flags & kGroupAVX3) == kGroupAVX3) {
348
0
    bits |= HWY_AVX3;
349
0
  }
350
2
  if ((flags & kGroupAVX2) == kGroupAVX2) {
351
2
    bits |= HWY_AVX2;
352
2
  }
353
2
  if ((flags & kGroupSSE4) == kGroupSSE4) {
354
2
    bits |= HWY_SSE4;
355
2
  }
356
2
  if ((flags & kGroupSSSE3) == kGroupSSSE3) {
357
2
    bits |= HWY_SSSE3;
358
2
  }
359
2
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_32) {
360
    if ((flags & kGroupSSE2) == kGroupSSE2) {
361
      bits |= HWY_SSE2;
362
    }
363
  }
364
365
  // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
366
  // YMM/ZMM registers are not preserved across context switches.
367
368
  // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
369
  // context switches on x86_64
370
371
  // The following OS's are known to preserve the lower 128 bits of XMM
372
  // registers across context switches on x86 CPU's that support SSE (even in
373
  // 32-bit mode):
374
  // - Windows 2000 or later
375
  // - Linux 2.4.0 or later
376
  // - Mac OS X 10.4 or later
377
  // - FreeBSD 4.4 or later
378
  // - NetBSD 1.6 or later
379
  // - OpenBSD 3.5 or later
380
  // - UnixWare 7 Release 7.1.1 or later
381
  // - Solaris 9 4/04 or later
382
383
2
  uint32_t abcd[4];
384
2
  Cpuid(1, 0, abcd);
385
2
  const bool has_xsave = IsBitSet(abcd[2], 26);
386
2
  const bool has_osxsave = IsBitSet(abcd[2], 27);
387
2
  constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);
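  // Added note (annotation): HWY_AVX2 | (HWY_AVX2 - 1) sets the HWY_AVX2 bit
  // together with every lower-valued bit; in Highway's target encoding, lower
  // bit values denote newer/better targets, so clearing min_avx2 removes AVX2
  // and all AVX-512 targets in one mask operation.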
388
389
2
  if (has_xsave && has_osxsave) {
390
#if HWY_OS_APPLE
391
    // On macOS, check for AVX3 XSAVE support by checking that we are running on
392
    // macOS 12.2 or later and HasCpuFeature("hw.optional.avx512f") returns true
393
394
    // There is a bug in macOS 12.1 or earlier that can cause ZMM16-ZMM31, the
395
    // upper 256 bits of the ZMM registers, and K0-K7 (the AVX512 mask
396
    // registers) to not be properly preserved across a context switch on
397
    // macOS 12.1 or earlier.
398
399
    // This bug on macOS 12.1 or earlier on x86_64 CPU's with AVX3 support is
400
    // described at
401
    // https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259,
402
    // https://github.com/golang/go/issues/49233, and
403
    // https://github.com/simdutf/simdutf/pull/236.
404
405
    // In addition to the bug that is there on macOS 12.1 or earlier, bits 5, 6,
406
    // and 7 can be set to 0 on x86_64 CPU's with AVX3 support on macOS until
407
    // the first AVX512 instruction is executed as macOS only preserves
408
    // ZMM16-ZMM31, the upper 256 bits of the ZMM registers, and K0-K7 across a
409
    // context switch on threads that have executed an AVX512 instruction.
410
411
    // Checking for AVX3 XSAVE support on macOS using
412
    // HasCpuFeature("hw.optional.avx512f") avoids false negative results
413
    // on x86_64 CPU's that have AVX3 support.
414
    const bool have_avx3_xsave_support =
415
        IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f");
416
#endif
417
418
2
    const uint32_t xcr0 = ReadXCR0();
419
2
    constexpr int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_SPR;
420
    // XMM/YMM
421
2
    if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
422
      // Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
423
0
      bits &= ~min_avx2;
424
0
    }
425
426
2
#if !HWY_OS_APPLE
427
    // On OS's other than macOS, check for AVX3 XSAVE support by checking that
428
    // bits 5, 6, and 7 of XCR0 are set.
429
2
    const bool have_avx3_xsave_support =
430
2
        IsBitSet(xcr0, 5) && IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);
431
2
#endif
432
433
    // opmask, ZMM lo/hi
434
2
    if (!have_avx3_xsave_support) {
435
2
      bits &= ~min_avx3;
436
2
    }
437
2
  } else {  // !has_xsave || !has_osxsave
438
    // Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
439
0
    bits &= ~min_avx2;
440
0
  }
441
442
  // This is mainly to work around the slow Zen4 CompressStore. It's unclear
443
  // whether subsequent AMD models will be affected; assume yes.
444
2
  if ((bits & HWY_AVX3_DL) && (flags & kGroupAVX3_ZEN4) == kGroupAVX3_ZEN4 &&
445
2
      IsAMD()) {
446
0
    bits |= HWY_AVX3_ZEN4;
447
0
  }
448
449
2
  return bits;
450
2
}
451
452
}  // namespace x86
453
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
454
namespace arm {
455
int64_t DetectTargets() {
456
  int64_t bits = 0;  // return value of supported targets.
457
458
  using CapBits = unsigned long;  // NOLINT
459
#if HWY_OS_APPLE
460
  const CapBits hw = 0UL;
461
#else
462
  // For Android, this has been supported since API 20 (2014).
463
  const CapBits hw = getauxval(AT_HWCAP);
464
#endif
465
  (void)hw;
466
467
#if HWY_ARCH_ARM_A64
468
  bits |= HWY_NEON_WITHOUT_AES;  // aarch64 always has NEON and VFPv4..
469
470
#if HWY_OS_APPLE
471
  if (HasCpuFeature("hw.optional.arm.FEAT_AES")) {
472
    bits |= HWY_NEON;
473
474
    if (HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") &&
475
        HasCpuFeature("hw.optional.arm.FEAT_DotProd") &&
476
        HasCpuFeature("hw.optional.arm.FEAT_BF16")) {
477
      bits |= HWY_NEON_BF16;
478
    }
479
  }
480
#else  // !HWY_OS_APPLE
481
  // .. but not necessarily AES, which is required for HWY_NEON.
482
#if defined(HWCAP_AES)
483
  if (hw & HWCAP_AES) {
484
    bits |= HWY_NEON;
485
486
#if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16)
487
    const CapBits hw2 = getauxval(AT_HWCAP2);
488
    const int64_t kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP;
489
    if ((hw & kGroupF16Dot) == kGroupF16Dot && (hw2 & HWCAP2_BF16)) {
490
      bits |= HWY_NEON_BF16;
491
    }
492
#endif  // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16
493
  }
494
#endif  // HWCAP_AES
495
496
#if defined(HWCAP_SVE)
497
  if (hw & HWCAP_SVE) {
498
    bits |= HWY_SVE;
499
  }
500
#endif
501
502
#ifndef HWCAP2_SVE2
503
#define HWCAP2_SVE2 (1 << 1)
504
#endif
505
#ifndef HWCAP2_SVEAES
506
#define HWCAP2_SVEAES (1 << 2)
507
#endif
508
  const CapBits hw2 = getauxval(AT_HWCAP2);
509
  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
510
    bits |= HWY_SVE2;
511
  }
512
#endif  // HWY_OS_APPLE
513
514
#else  // !HWY_ARCH_ARM_A64
515
516
// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
517
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
518
  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
519
    bits |= HWY_NEON_WITHOUT_AES;
520
  }
521
#endif
522
523
  // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
524
  // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
525
  // requires native AES instructions, we do not enable that target here.
526
527
#endif  // HWY_ARCH_ARM_A64
528
  return bits;
529
}
530
}  // namespace arm
531
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
532
namespace ppc {
533
534
#ifndef PPC_FEATURE_HAS_ALTIVEC
535
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
536
#endif
537
538
#ifndef PPC_FEATURE_HAS_VSX
539
#define PPC_FEATURE_HAS_VSX 0x00000080
540
#endif
541
542
#ifndef PPC_FEATURE2_ARCH_2_07
543
#define PPC_FEATURE2_ARCH_2_07 0x80000000
544
#endif
545
546
#ifndef PPC_FEATURE2_VEC_CRYPTO
547
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
548
#endif
549
550
#ifndef PPC_FEATURE2_ARCH_3_00
551
#define PPC_FEATURE2_ARCH_3_00 0x00800000
552
#endif
553
554
#ifndef PPC_FEATURE2_ARCH_3_1
555
#define PPC_FEATURE2_ARCH_3_1 0x00040000
556
#endif
557
558
using CapBits = unsigned long;  // NOLINT
559
560
// For AT_HWCAP, the others are for AT_HWCAP2
561
constexpr CapBits kGroupVSX = PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;
562
563
#if defined(HWY_DISABLE_PPC8_CRYPTO)
564
constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07;
565
#else
566
constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
567
#endif
568
constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00;
569
constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1;
570
571
int64_t DetectTargets() {
572
  int64_t bits = 0;  // return value of supported targets.
573
574
#if defined(AT_HWCAP) && defined(AT_HWCAP2)
575
  const CapBits hw = getauxval(AT_HWCAP);
576
577
  if ((hw & kGroupVSX) == kGroupVSX) {
578
    const CapBits hw2 = getauxval(AT_HWCAP2);
579
    if ((hw2 & kGroupPPC8) == kGroupPPC8) {
580
      bits |= HWY_PPC8;
581
    }
582
    if ((hw2 & kGroupPPC9) == kGroupPPC9) {
583
      bits |= HWY_PPC9;
584
    }
585
    if ((hw2 & kGroupPPC10) == kGroupPPC10) {
586
      bits |= HWY_PPC10;
587
    }
588
  }  // VSX
589
#endif  // defined(AT_HWCAP) && defined(AT_HWCAP2)
590
591
  return bits;
592
}
593
}  // namespace ppc
594
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
595
namespace s390x {
596
597
#ifndef HWCAP_S390_VX
598
#define HWCAP_S390_VX 2048
599
#endif
600
601
#ifndef HWCAP_S390_VXE
602
#define HWCAP_S390_VXE 8192
603
#endif
604
605
#ifndef HWCAP_S390_VXRS_EXT2
606
#define HWCAP_S390_VXRS_EXT2 32768
607
#endif
608
609
using CapBits = unsigned long;  // NOLINT
610
611
constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE;
612
constexpr CapBits kGroupZ15 =
613
    HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2;
614
615
int64_t DetectTargets() {
616
  int64_t bits = 0;
617
618
#if defined(AT_HWCAP)
619
  const CapBits hw = getauxval(AT_HWCAP);
620
621
  if ((hw & kGroupZ14) == kGroupZ14) {
622
    bits |= HWY_Z14;
623
  }
624
625
  if ((hw & kGroupZ15) == kGroupZ15) {
626
    bits |= HWY_Z15;
627
  }
628
#endif
629
630
  return bits;
631
}
632
}  // namespace s390x
633
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
634
namespace rvv {
635
636
#ifndef HWCAP_RVV
637
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
638
#endif
639
640
using CapBits = unsigned long;  // NOLINT
641
642
int64_t DetectTargets() {
643
  int64_t bits = 0;
644
645
  const CapBits hw = getauxval(AT_HWCAP);
646
647
  if ((hw & COMPAT_HWCAP_ISA_V) == COMPAT_HWCAP_ISA_V) {
648
    size_t e8m1_vec_len;
649
#if HWY_ARCH_RISCV_64
650
    int64_t vtype_reg_val;
651
#else
652
    int32_t vtype_reg_val;
653
#endif
654
655
    // Check that a vuint8m1_t vector is at least 16 bytes and that tail
656
    // agnostic and mask agnostic mode are supported
657
    asm volatile(
658
        // Avoid compiler error on GCC or Clang if -march=rv64gcv1p0 or
659
        // -march=rv32gcv1p0 option is not specified on the command line
660
        ".option push\n\t"
661
        ".option arch, +v\n\t"
662
        "vsetvli %0, zero, e8, m1, ta, ma\n\t"
663
        "csrr %1, vtype\n\t"
664
        ".option pop"
665
        : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));
666
667
    // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
668
    // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
669
    // bytes
670
    if (vtype_reg_val >= 0 && e8m1_vec_len >= 16) {
671
      bits |= HWY_RVV;
672
    }
673
  }
674
675
  return bits;
676
}
677
}  // namespace rvv
678
#endif  // HWY_ARCH_*
679
680
// Returns targets supported by the CPU, independently of DisableTargets.
681
// Factored out of SupportedTargets to make its structure more obvious. Note
682
// that x86 CPUID may take several hundred cycles.
683
2
int64_t DetectTargets() {
684
  // Apps will use only one of these (the default is EMU128), but compile flags
685
  // for this TU may differ from that of the app, so allow both.
686
2
  int64_t bits = HWY_SCALAR | HWY_EMU128;
687
688
2
#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
689
2
  bits |= x86::DetectTargets();
690
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
691
  bits |= arm::DetectTargets();
692
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
693
  bits |= ppc::DetectTargets();
694
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
695
  bits |= s390x::DetectTargets();
696
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
697
  bits |= rvv::DetectTargets();
698
699
#else
700
  // TODO(janwas): detect support for WASM.
701
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
702
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
703
  // supported.
704
  bits |= HWY_ENABLED_BASELINE;
705
#endif  // HWY_ARCH_*
706
707
2
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
708
0
    const uint64_t bits_u = static_cast<uint64_t>(bits);
709
0
    const uint64_t enabled = static_cast<uint64_t>(HWY_ENABLED_BASELINE);
710
0
    fprintf(stderr,
711
0
            "WARNING: CPU supports 0x%08x%08x, software requires 0x%08x%08x\n",
712
0
            static_cast<uint32_t>(bits_u >> 32),
713
0
            static_cast<uint32_t>(bits_u & 0xFFFFFFFF),
714
0
            static_cast<uint32_t>(enabled >> 32),
715
0
            static_cast<uint32_t>(enabled & 0xFFFFFFFF));
716
0
  }
717
718
2
  return bits;
719
2
}
720
721
}  // namespace
722
723
0
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
724
0
  supported_mask_ = static_cast<int64_t>(~disabled_targets);
725
  // This will take effect on the next call to SupportedTargets, which is
726
  // called right before GetChosenTarget::Update. However, calling Update here
727
  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
728
  // to check in tests. We instead de-initialize such that the next
729
  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
730
0
  GetChosenTarget().DeInit();
731
0
}
732
733
0
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
734
0
  supported_targets_for_test_ = targets;
735
0
  GetChosenTarget().DeInit();  // see comment above
736
0
}
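// Illustrative sketch (annotation, not part of targets.cc): how these two
// hooks compose given the definitions above; arguments are masks of HWY_*
// target bits.
//   hwy::DisableTargets(HWY_AVX3 | HWY_AVX3_DL);  // hide selected targets
//   hwy::SetSupportedTargetsForTest(HWY_SSSE3);   // or mock the whole mask
//   // ... exercise HWY_DYNAMIC_DISPATCH ...
//   hwy::SetSupportedTargetsForTest(0);           // back to real detection
//   hwy::DisableTargets(0);                       // clear the disable mask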
737
738
2
HWY_DLLEXPORT int64_t SupportedTargets() {
739
2
  int64_t targets = supported_targets_for_test_;
740
2
  if (HWY_LIKELY(targets == 0)) {
741
    // Mock not active. Re-detect instead of caching just in case we're on a
742
    // heterogeneous ISA (also requires some app support to pin threads). This
743
    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
744
    // DisableTargets or SetSupportedTargetsForTest.
745
2
    targets = DetectTargets();
746
747
    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
748
    // first set up ChosenTarget. No need to Update() again afterwards with the
749
    // final targets - that will be done by a caller of this function.
750
2
    GetChosenTarget().Update(targets);
751
752
    // Now that we can call VectorBytes, check for targets with specific sizes.
753
2
    if (HWY_ARCH_ARM_A64) {
754
0
      const size_t vec_bytes = VectorBytes();  // uncached, see declaration
755
0
      if ((targets & HWY_SVE) && vec_bytes == 32) {
756
0
        targets = static_cast<int64_t>(targets | HWY_SVE_256);
757
0
      } else {
758
0
        targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
759
0
      }
760
0
      if ((targets & HWY_SVE2) && vec_bytes == 16) {
761
0
        targets = static_cast<int64_t>(targets | HWY_SVE2_128);
762
0
      } else {
763
0
        targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
764
0
      }
765
0
    }  // HWY_ARCH_ARM_A64
766
2
  }
767
768
2
  targets &= supported_mask_;
769
2
  return targets == 0 ? HWY_STATIC_TARGET : targets;
770
2
}
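// Added note (annotation): HWY_SVE_256 and HWY_SVE2_128 are specializations of
// SVE/SVE2 that assume a fixed vector length (256 and 128 bits respectively),
// so they are only advertised when VectorBytes() confirms the running CPU uses
// exactly that length, and are cleared otherwise.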
771
772
8.46M
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
773
8.46M
  static ChosenTarget chosen_target;
774
8.46M
  return chosen_target;
775
8.46M
}
776
777
}  // namespace hwy