/work/include/simdutf/internal/isadetection.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* From |
2 | | https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h |
3 | | Highly modified. |
4 | | |
5 | | Copyright (c) 2016- Facebook, Inc (Adam Paszke) |
6 | | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) |
7 | | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) |
8 | | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) |
9 | | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) |
10 | | Copyright (c) 2011-2013 NYU (Clement Farabet) |
11 | | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, |
12 | | Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute |
13 | | (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, |
14 | | Samy Bengio, Johnny Mariethoz) |
15 | | |
16 | | All rights reserved. |
17 | | |
18 | | Redistribution and use in source and binary forms, with or without |
19 | | modification, are permitted provided that the following conditions are met: |
20 | | |
21 | | 1. Redistributions of source code must retain the above copyright |
22 | | notice, this list of conditions and the following disclaimer. |
23 | | |
24 | | 2. Redistributions in binary form must reproduce the above copyright |
25 | | notice, this list of conditions and the following disclaimer in the |
26 | | documentation and/or other materials provided with the distribution. |
27 | | |
28 | | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories |
29 | | America and IDIAP Research Institute nor the names of its contributors may be |
30 | | used to endorse or promote products derived from this software without |
31 | | specific prior written permission. |
32 | | |
33 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
34 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
35 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
36 | | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
37 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
38 | | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
39 | | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
40 | | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
41 | | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
42 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
43 | | POSSIBILITY OF SUCH DAMAGE. |
44 | | */ |
45 | | |
46 | | #ifndef SIMDutf_INTERNAL_ISADETECTION_H |
47 | | #define SIMDutf_INTERNAL_ISADETECTION_H |
48 | | |
49 | | #include <cstdint> |
50 | | #include <cstdlib> |
51 | | #if defined(_MSC_VER) |
52 | | #include <intrin.h> |
53 | | #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) |
54 | | #include <cpuid.h> |
55 | | #endif |
56 | | |
57 | | #include "simdutf/portability.h" |
58 | | |
59 | | // RISC-V ISA detection utilities |
60 | | #if SIMDUTF_IS_RISCV64 && defined(__linux__) |
61 | | #include <unistd.h> // for syscall |
62 | | // We define these ourselves, for backwards compatibility |
// Local stand-in for the kernel's key/value probe record, declared here so
// the header does not depend on the toolchain shipping the hwprobe
// definitions (see the "backwards compatibility" note above).
struct simdutf_riscv_hwprobe {
  int64_t key;    // which property is being queried
  uint64_t value; // filled in with the answer for `key`
};

// Issues the probe through the raw syscall interface (syscall number 258).
// This is a function-like macro, so the bare identifier
// `simdutf_riscv_hwprobe` (not followed by a parenthesis) still names the
// struct declared above.
#define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__)

// Probe key whose value is tested against the extension bits below.
#define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4
// Bit reported for the vector extension (mapped to instruction_set::RVV).
#define SIMDUTF_RISCV_HWPROBE_IMA_V (1 << 2)
// Bit reported for the Zvbb extension (mapped to instruction_set::ZVBB).
#define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17)
71 | | #endif // SIMDUTF_IS_RISCV64 && defined(__linux__) |
72 | | |
73 | | #if defined(__loongarch__) && defined(__linux__) |
74 | | #include <sys/auxv.h> |
75 | | // bits/hwcap.h |
76 | | // #define HWCAP_LOONGARCH_LSX (1 << 4) |
77 | | // #define HWCAP_LOONGARCH_LASX (1 << 5) |
78 | | #endif |
79 | | |
80 | | namespace simdutf { |
81 | | namespace internal { |
82 | | |
/**
 * Bit flags describing the SIMD / ISA extensions that the detection routines
 * in this header can report. detect_supported_architectures() returns a
 * bitwise OR of these values.
 *
 * Flags that can be reported together by the same detector must occupy
 * distinct bits. RVV and ZVBB deliberately keep the numeric values they share
 * with AVX512BW and AVX512VL: the RISC-V and x86-64 detectors live in
 * mutually exclusive branches of the #if/#elif chain in this file, so those
 * flags are never produced by the same build.
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  // Previously 0x2000, which aliased AVX512CD: any x86 CPU reporting
  // AVX512CD was then also reported as supporting VPOPCNTDQ. 0x20000 is the
  // otherwise unused bit between AVX512VBMI2 and LSX.
  AVX512VPOPCNTDQ = 0x20000,
  RVV = 0x4000,  // RISC-V only; same value as AVX512BW (see note above)
  ZVBB = 0x8000, // RISC-V only; same value as AVX512VL (see note above)
  LSX = 0x40000,
  LASX = 0x80000,
};
107 | | |
108 | | #if defined(__PPC64__) |
109 | | |
110 | | static inline uint32_t detect_supported_architectures() { |
111 | | return instruction_set::ALTIVEC; |
112 | | } |
113 | | |
114 | | #elif SIMDUTF_IS_RISCV64 |
115 | | |
116 | | static inline uint32_t detect_supported_architectures() { |
117 | | uint32_t host_isa = instruction_set::DEFAULT; |
118 | | #if SIMDUTF_IS_RVV |
119 | | host_isa |= instruction_set::RVV; |
120 | | #endif |
121 | | #if SIMDUTF_IS_ZVBB |
122 | | host_isa |= instruction_set::ZVBB; |
123 | | #endif |
124 | | #if defined(__linux__) |
125 | | simdutf_riscv_hwprobe probes[] = {{SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0}}; |
126 | | long ret = simdutf_riscv_hwprobe(&probes, sizeof probes / sizeof *probes, 0, |
127 | | nullptr, 0); |
128 | | if (ret == 0) { |
129 | | uint64_t extensions = probes[0].value; |
130 | | if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V) |
131 | | host_isa |= instruction_set::RVV; |
132 | | if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB) |
133 | | host_isa |= instruction_set::ZVBB; |
134 | | } |
135 | | #endif |
136 | | #if defined(RUN_IN_SPIKE_SIMULATOR) |
137 | | // Proxy Kernel does not implement yet hwprobe syscall |
138 | | host_isa |= instruction_set::RVV; |
139 | | #endif |
140 | | return host_isa; |
141 | | } |
142 | | |
143 | | #elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) |
144 | | |
145 | | static inline uint32_t detect_supported_architectures() { |
146 | | return instruction_set::NEON; |
147 | | } |
148 | | |
149 | | #elif defined(__x86_64__) || defined(_M_AMD64) // x64 |
150 | | |
namespace {
// Bit masks used to decode CPUID results and the XCR0 register.
// The layouts can be found in the Intel ISA reference for CPUID.
namespace cpuid_bit {

// Leaf EAX = 0x01: feature bits returned in ECX.
constexpr uint32_t pclmulqdq = UINT32_C(1) << 1;
///< @private bit 1 of ECX for EAX=0x1
constexpr uint32_t sse42 = UINT32_C(1) << 20;
///< @private bit 20 of ECX for EAX=0x1
constexpr uint32_t osxsave = (UINT32_C(1) << 26) | (UINT32_C(1) << 27);
///< @private bits 26+27 of ECX for EAX=0x1

// Leaf EAX = 0x07 (Structured Extended Feature Flags), sub-leaf ECX = 0x00.
// See: "Table 3-8. Information Returned by CPUID Instruction"
namespace ebx {
constexpr uint32_t bmi1 = UINT32_C(1) << 3;
constexpr uint32_t avx2 = UINT32_C(1) << 5;
constexpr uint32_t bmi2 = UINT32_C(1) << 8;
constexpr uint32_t avx512f = UINT32_C(1) << 16;
constexpr uint32_t avx512dq = UINT32_C(1) << 17;
constexpr uint32_t avx512ifma = UINT32_C(1) << 21;
constexpr uint32_t avx512cd = UINT32_C(1) << 28;
constexpr uint32_t avx512bw = UINT32_C(1) << 30;
constexpr uint32_t avx512vl = UINT32_C(1) << 31;
} // namespace ebx

namespace ecx {
constexpr uint32_t avx512vbmi = UINT32_C(1) << 1;
constexpr uint32_t avx512vbmi2 = UINT32_C(1) << 6;
constexpr uint32_t avx512vnni = UINT32_C(1) << 11;
constexpr uint32_t avx512bitalg = UINT32_C(1) << 12;
constexpr uint32_t avx512vpopcnt = UINT32_C(1) << 14;
} // namespace ecx

namespace edx {
constexpr uint32_t avx512vp2intersect = UINT32_C(1) << 8;
} // namespace edx

// XCR0 state-component bits (value returned by xgetbv).
namespace xcr0_bit {
constexpr uint64_t avx256_saved = UINT64_C(1) << 2;
///< @private bit 2 = AVX
constexpr uint64_t avx512_saved = UINT64_C(7) << 5;
///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
} // namespace xcr0_bit

} // namespace cpuid_bit
} // namespace
195 | | |
/**
 * Executes the CPUID instruction.
 *
 * All four pointers are in/out register slots:
 * @param eax on entry the leaf to query; on return the EAX result.
 * @param ebx on return the EBX result.
 * @param ecx on entry the sub-leaf to query; on return the ECX result.
 * @param edx on return the EDX result.
 *
 * In the <cpuid.h> branch, when the CPU does not support the requested leaf
 * the outputs are left untouched (the same behaviour __get_cpuid() had).
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, *eax, *ecx);
  *eax = cpu_info[0];
  *ebx = cpu_info[1];
  *ecx = cpu_info[2];
  *edx = cpu_info[3];
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  const uint32_t leaf = *eax;
  const uint32_t sub_leaf = *ecx;
  // __get_cpuid() never loads ECX, so the caller's sub-leaf (needed for leaf
  // 0x7) was silently ignored. Do the same availability check it performs and
  // then issue the sub-leaf aware __cpuid_count().
  const uint32_t max_leaf = __get_cpuid_max(leaf & 0x80000000, nullptr);
  if (max_leaf == 0 || max_leaf < leaf) {
    return;
  }
  __cpuid_count(leaf, sub_leaf, *eax, *ebx, *ecx, *edx);
#else
  // EAX and ECX are read-write operands (leaf / sub-leaf in, results out).
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
217 | | |
/// Reads extended control register 0 (XCR0) and returns it as one 64-bit
/// value.
static inline uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  // The instruction delivers the register in two 32-bit halves (EDX:EAX);
  // ECX = 0 selects XCR0.
  uint32_t low_half, high_half;
  asm volatile("xgetbv\n\t" : "=a"(low_half), "=d"(high_half) : "c"(0));
  const uint64_t upper_bits = static_cast<uint64_t>(high_half) << 32;
  return upper_bits | low_half;
#endif
}
227 | | |
228 | 0 | static inline uint32_t detect_supported_architectures() { |
229 | 0 | uint32_t eax; |
230 | 0 | uint32_t ebx = 0; |
231 | 0 | uint32_t ecx = 0; |
232 | 0 | uint32_t edx = 0; |
233 | 0 | uint32_t host_isa = 0x0; |
234 | 0 |
|
235 | 0 | // EBX for EAX=0x1 |
236 | 0 | eax = 0x1; |
237 | 0 | cpuid(&eax, &ebx, &ecx, &edx); |
238 | 0 |
|
239 | 0 | if (ecx & cpuid_bit::sse42) { |
240 | 0 | host_isa |= instruction_set::SSE42; |
241 | 0 | } |
242 | 0 |
|
243 | 0 | if (ecx & cpuid_bit::pclmulqdq) { |
244 | 0 | host_isa |= instruction_set::PCLMULQDQ; |
245 | 0 | } |
246 | 0 |
|
247 | 0 | if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) { |
248 | 0 | return host_isa; |
249 | 0 | } |
250 | 0 |
|
251 | 0 | // xgetbv for checking if the OS saves registers |
252 | 0 | uint64_t xcr0 = xgetbv(); |
253 | 0 |
|
254 | 0 | if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) { |
255 | 0 | return host_isa; |
256 | 0 | } |
257 | 0 | // ECX for EAX=0x7 |
258 | 0 | eax = 0x7; |
259 | 0 | ecx = 0x0; // Sub-leaf = 0 |
260 | 0 | cpuid(&eax, &ebx, &ecx, &edx); |
261 | 0 | if (ebx & cpuid_bit::ebx::avx2) { |
262 | 0 | host_isa |= instruction_set::AVX2; |
263 | 0 | } |
264 | 0 | if (ebx & cpuid_bit::ebx::bmi1) { |
265 | 0 | host_isa |= instruction_set::BMI1; |
266 | 0 | } |
267 | 0 | if (ebx & cpuid_bit::ebx::bmi2) { |
268 | 0 | host_isa |= instruction_set::BMI2; |
269 | 0 | } |
270 | 0 | if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == |
271 | 0 | cpuid_bit::xcr0_bit::avx512_saved)) { |
272 | 0 | return host_isa; |
273 | 0 | } |
274 | 0 | if (ebx & cpuid_bit::ebx::avx512f) { |
275 | 0 | host_isa |= instruction_set::AVX512F; |
276 | 0 | } |
277 | 0 | if (ebx & cpuid_bit::ebx::avx512bw) { |
278 | 0 | host_isa |= instruction_set::AVX512BW; |
279 | 0 | } |
280 | 0 | if (ebx & cpuid_bit::ebx::avx512cd) { |
281 | 0 | host_isa |= instruction_set::AVX512CD; |
282 | 0 | } |
283 | 0 | if (ebx & cpuid_bit::ebx::avx512dq) { |
284 | 0 | host_isa |= instruction_set::AVX512DQ; |
285 | 0 | } |
286 | 0 | if (ebx & cpuid_bit::ebx::avx512vl) { |
287 | 0 | host_isa |= instruction_set::AVX512VL; |
288 | 0 | } |
289 | 0 | if (ecx & cpuid_bit::ecx::avx512vbmi2) { |
290 | 0 | host_isa |= instruction_set::AVX512VBMI2; |
291 | 0 | } |
292 | 0 | if (ecx & cpuid_bit::ecx::avx512vpopcnt) { |
293 | 0 | host_isa |= instruction_set::AVX512VPOPCNTDQ; |
294 | 0 | } |
295 | 0 | return host_isa; |
296 | 0 | } |
297 | | #elif defined(__loongarch__) |
298 | | |
299 | | static inline uint32_t detect_supported_architectures() { |
300 | | uint32_t host_isa = instruction_set::DEFAULT; |
301 | | #if defined(__linux__) |
302 | | uint64_t hwcap = 0; |
303 | | hwcap = getauxval(AT_HWCAP); |
304 | | if (hwcap & HWCAP_LOONGARCH_LSX) { |
305 | | host_isa |= instruction_set::LSX; |
306 | | } |
307 | | if (hwcap & HWCAP_LOONGARCH_LASX) { |
308 | | host_isa |= instruction_set::LASX; |
309 | | } |
310 | | #endif |
311 | | return host_isa; |
312 | | } |
313 | | #else // fallback |
314 | | |
315 | | // includes 32-bit ARM. |
316 | | static inline uint32_t detect_supported_architectures() { |
317 | | return instruction_set::DEFAULT; |
318 | | } |
319 | | |
320 | | #endif // end SIMD extension detection code |
321 | | |
322 | | } // namespace internal |
323 | | } // namespace simdutf |
324 | | |
325 | | #endif // SIMDutf_INTERNAL_ISADETECTION_H |