Coverage Report

Created: 2025-06-16 07:00

/src/libdeflate/lib/x86/cpu_features.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * x86/cpu_features.c - feature detection for x86 CPUs
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
#include "../cpu_features_common.h" /* must be included first */
29
#include "cpu_features.h"
30
31
#ifdef X86_CPU_FEATURES_KNOWN
32
/* Runtime x86 CPU feature detection is supported. */
33
34
/* Execute the CPUID instruction. */
35
static inline void
36
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
37
24
{
38
#ifdef _MSC_VER
39
  int result[4];
40
41
  __cpuidex(result, leaf, subleaf);
42
  *a = result[0];
43
  *b = result[1];
44
  *c = result[2];
45
  *d = result[3];
46
#else
47
24
  __asm__ volatile("cpuid" : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
48
24
       : "a" (leaf), "c" (subleaf));
49
24
#endif
50
24
}
51
52
/* Read an extended control register. */
53
static inline u64
54
read_xcr(u32 index)
55
6
{
56
#ifdef _MSC_VER
57
  return _xgetbv(index);
58
#else
59
6
  u32 d, a;
60
61
  /*
62
   * Execute the "xgetbv" instruction.  Old versions of binutils do not
63
   * recognize this instruction, so list the raw bytes instead.
64
   *
65
   * This must be 'volatile' to prevent this code from being moved out
66
   * from under the check for OSXSAVE.
67
   */
68
6
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0" :
69
6
       "=d" (d), "=a" (a) : "c" (index));
70
71
6
  return ((u64)d << 32) | a;
72
6
#endif
73
6
}
74
75
static const struct cpu_feature x86_cpu_feature_table[] = {
76
  {X86_CPU_FEATURE_SSE2,    "sse2"},
77
  {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"},
78
  {X86_CPU_FEATURE_AVX,   "avx"},
79
  {X86_CPU_FEATURE_AVX2,    "avx2"},
80
  {X86_CPU_FEATURE_BMI2,    "bmi2"},
81
  {X86_CPU_FEATURE_ZMM,   "zmm"},
82
  {X86_CPU_FEATURE_AVX512BW,  "avx512bw"},
83
  {X86_CPU_FEATURE_AVX512VL,  "avx512vl"},
84
  {X86_CPU_FEATURE_VPCLMULQDQ,  "vpclmulqdq"},
85
  {X86_CPU_FEATURE_AVX512VNNI,  "avx512_vnni"},
86
  {X86_CPU_FEATURE_AVXVNNI, "avx_vnni"},
87
};
88
89
/*
 * Bitmask of detected x86 CPU features.  It starts out as 0 and is written by
 * libdeflate_init_x86_cpu_features(), which stores the detected feature bits
 * ORed with X86_CPU_FEATURES_KNOWN.
 *
 * NOTE(review): presumably readers treat a value without the
 * X86_CPU_FEATURES_KNOWN bit as "detection has not run yet", and 'volatile'
 * is there so that such reads are not cached -- confirm against
 * cpu_features.h.
 */
volatile u32 libdeflate_x86_cpu_features = 0;
90
91
static inline bool
92
os_supports_avx512(u64 xcr0)
93
6
{
94
#ifdef __APPLE__
95
  /*
96
   * The Darwin kernel had a bug where it could corrupt the opmask
97
   * registers.  See
98
   * https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259
99
   * Darwin also does not initially set the XCR0 bits for AVX512, but they
100
   * are set if the thread tries to use AVX512 anyway.  Thus, to safely
101
   * and consistently use AVX512 on macOS we'd need to check the kernel
102
   * version as well as detect AVX512 support using a macOS-specific
103
   * method.  We don't bother with this, especially given Apple's
104
   * transition to arm64.
105
   */
106
  return false;
107
#else
108
6
  return (xcr0 & 0xe6) == 0xe6;
109
6
#endif
110
6
}
111
112
/*
113
 * Don't use 512-bit vectors (ZMM registers) on Intel CPUs before Rocket Lake
114
 * and Sapphire Rapids, due to the overly-eager downclocking which can reduce
115
 * the performance of workloads that use ZMM registers only occasionally.
116
 */
117
static inline bool
118
allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
119
0
{
120
#ifdef TEST_SUPPORT__DO_NOT_USE
121
  return true;
122
#endif
123
0
  if (memcmp(manufacturer, "GenuineIntel", 12) != 0)
124
0
    return true;
125
0
  if (family != 6)
126
0
    return true;
127
0
  switch (model) {
128
0
  case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */
129
0
  case 106: /* Ice Lake (Server) */
130
0
  case 108: /* Ice Lake (Server) */
131
0
  case 126: /* Ice Lake (Client) */
132
0
  case 140: /* Tiger Lake */
133
0
  case 141: /* Tiger Lake */
134
0
    return false;
135
0
  }
136
0
  return true;
137
0
}
138
139
/* Initialize libdeflate_x86_cpu_features. */
140
void libdeflate_init_x86_cpu_features(void)
141
6
{
142
6
  u32 max_leaf;
143
6
  u32 manufacturer[3];
144
6
  u32 family, model;
145
6
  u32 a, b, c, d;
146
6
  u64 xcr0 = 0;
147
6
  u32 features = 0;
148
149
  /* EAX=0: Highest Function Parameter and Manufacturer ID */
150
6
  cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2],
151
6
        &manufacturer[1]);
152
6
  if (max_leaf < 1)
153
0
    goto out;
154
155
  /* EAX=1: Processor Info and Feature Bits */
156
6
  cpuid(1, 0, &a, &b, &c, &d);
157
6
  family = (a >> 8) & 0xf;
158
6
  model = (a >> 4) & 0xf;
159
6
  if (family == 6 || family == 0xf)
160
6
    model += (a >> 12) & 0xf0;
161
6
  if (family == 0xf)
162
6
    family += (a >> 20) & 0xff;
163
6
  if (d & (1 << 26))
164
6
    features |= X86_CPU_FEATURE_SSE2;
165
  /*
166
   * No known CPUs have pclmulqdq without sse4.1, so in practice code
167
   * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
168
   * explicitly check for both the pclmulqdq and sse4.1 bits.
169
   */
170
6
  if ((c & (1 << 1)) && (c & (1 << 19)))
171
6
    features |= X86_CPU_FEATURE_PCLMULQDQ;
172
6
  if (c & (1 << 27))
173
6
    xcr0 = read_xcr(0);
174
6
  if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6))
175
6
    features |= X86_CPU_FEATURE_AVX;
176
177
6
  if (max_leaf < 7)
178
0
    goto out;
179
180
  /* EAX=7, ECX=0: Extended Features */
181
6
  cpuid(7, 0, &a, &b, &c, &d);
182
6
  if (b & (1 << 8))
183
6
    features |= X86_CPU_FEATURE_BMI2;
184
6
  if ((xcr0 & 0x6) == 0x6) {
185
6
    if (b & (1 << 5))
186
6
      features |= X86_CPU_FEATURE_AVX2;
187
6
    if (c & (1 << 10))
188
0
      features |= X86_CPU_FEATURE_VPCLMULQDQ;
189
6
  }
190
6
  if (os_supports_avx512(xcr0)) {
191
0
    if (allow_512bit_vectors(manufacturer, family, model))
192
0
      features |= X86_CPU_FEATURE_ZMM;
193
0
    if (b & (1 << 30))
194
0
      features |= X86_CPU_FEATURE_AVX512BW;
195
0
    if (b & (1U << 31))
196
0
      features |= X86_CPU_FEATURE_AVX512VL;
197
0
    if (c & (1 << 11))
198
0
      features |= X86_CPU_FEATURE_AVX512VNNI;
199
0
  }
200
201
  /* EAX=7, ECX=1: Extended Features */
202
6
  cpuid(7, 1, &a, &b, &c, &d);
203
6
  if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6))
204
0
    features |= X86_CPU_FEATURE_AVXVNNI;
205
206
6
out:
207
6
  disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
208
6
           ARRAY_LEN(x86_cpu_feature_table));
209
210
6
  libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
211
6
}
212
213
#endif /* X86_CPU_FEATURES_KNOWN */