Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * cpu.c: cpu detection |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Loren Merritt <lorenm@u.washington.edu> |
7 | | * Laurent Aimar <fenrir@via.ecp.fr> |
8 | | * Fiona Glaser <fiona@x264.com> |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or |
13 | | * (at your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | * GNU General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License |
21 | | * along with this program; if not, write to the Free Software |
22 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
23 | | * |
24 | | * This program is also available under a commercial proprietary license. |
25 | | * For more information, contact us at licensing@x264.com. |
26 | | *****************************************************************************/ |
27 | | |
28 | | #include "base.h" |
29 | | |
30 | | #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO |
31 | | #include <sys/auxv.h> |
32 | | #endif |
33 | | #if HAVE_SYSCONF |
34 | | #include <unistd.h> |
35 | | #endif |
36 | | #if SYS_LINUX |
37 | | #include <sched.h> |
38 | | #endif |
39 | | #if SYS_BEOS |
40 | | #include <kernel/OS.h> |
41 | | #endif |
42 | | #if SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD |
43 | | #include <sys/types.h> |
44 | | #include <sys/sysctl.h> |
45 | | #endif |
46 | | #if SYS_OPENBSD |
47 | | #include <machine/cpu.h> |
48 | | #endif |
49 | | |
/* Table mapping human-readable capability names to the x264 CPU flag set each
 * name implies.  On x86 the instruction-set entries are cumulative: naming a
 * newer ISA level also sets the flags of everything it presupposes (e.g.
 * "SSE2" includes MMX/MMX2/SSE).  The helper macros (MMX2/SSE2/AVX/AVX2) are
 * local shorthands and are #undef'd immediately after use.  The table is
 * terminated by a {"", 0} sentinel entry. */
const x264_cpu_name_t x264_cpu_names[] =
{
#if ARCH_X86 || ARCH_X86_64
//  {"MMX", X264_CPU_MMX},  // we don't support asm on mmx1 cpus anymore
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
    {"MMX2", MMX2},
    {"MMXEXT", MMX2},  /* alias for MMX2 */
    {"SSE", MMX2|X264_CPU_SSE},
#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
    {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
    {"SSE2", SSE2},
    {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
    {"LZCNT", SSE2|X264_CPU_LZCNT},
    {"SSE3", SSE2|X264_CPU_SSE3},
    {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
    {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},  /* alias for SSE4.1 */
    {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
#define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX
    {"AVX", AVX},
    {"XOP", AVX|X264_CPU_XOP},
    {"FMA4", AVX|X264_CPU_FMA4},
    {"FMA3", AVX|X264_CPU_FMA3},
    {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
    {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
    {"AVX2", AVX2},
    {"AVX512", AVX2|X264_CPU_AVX512},
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
    /* The remaining x86 entries are performance-characteristic modifiers
     * rather than instruction sets, so they do not imply other flags. */
    {"Cache32", X264_CPU_CACHELINE_32},
    {"Cache64", X264_CPU_CACHELINE_64},
    {"SlowAtom", X264_CPU_SLOW_ATOM},
    {"SlowPshufb", X264_CPU_SLOW_PSHUFB},
    {"SlowPalignr", X264_CPU_SLOW_PALIGNR},
    {"SlowShuffle", X264_CPU_SLOW_SHUFFLE},
    {"UnalignedStack", X264_CPU_STACK_MOD4},
#elif ARCH_PPC
    {"Altivec", X264_CPU_ALTIVEC},
#elif ARCH_ARM
    {"ARMv6", X264_CPU_ARMV6},
    {"NEON", X264_CPU_NEON},
    {"FastNeonMRC", X264_CPU_FAST_NEON_MRC},
#elif ARCH_AARCH64
    {"ARMv8", X264_CPU_ARMV8},
    {"NEON", X264_CPU_NEON},
    {"DotProd", X264_CPU_DOTPROD},
    {"I8MM", X264_CPU_I8MM},
    {"SVE", X264_CPU_SVE},
    {"SVE2", X264_CPU_SVE2},
#elif ARCH_RISCV64
    {"RVV", X264_CPU_RVV},
#elif ARCH_MIPS
    {"MSA", X264_CPU_MSA},
#elif ARCH_LOONGARCH
    {"LSX", X264_CPU_LSX},
    {"LASX", X264_CPU_LASX},
#endif
    {"", 0},  /* sentinel */
};
112 | | |
/* Read one entry from the ELF auxiliary vector.
 * Uses getauxval() (glibc/Linux) or elf_aux_info() (FreeBSD) depending on
 * which the build detected; returns 0 when neither mechanism exists or the
 * requested entry is absent. */
static unsigned long x264_getauxval( unsigned long type )
{
#if HAVE_GETAUXVAL
    return getauxval( type );
#elif HAVE_ELF_AUX_INFO
    unsigned long value = 0;
    elf_aux_info( type, &value, sizeof(value) );
    return value;
#else
    (void)type;
    return 0;
#endif
}
125 | | |
#if ((HAVE_ALTIVEC && SYS_LINUX) || (HAVE_ARMV6 && !HAVE_NEON)) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO)
/* SIGILL-probe machinery, used by the AltiVec and NEON detection paths below
 * when no auxv-based capability query is available: a SIMD instruction is
 * executed with this handler installed, and if it faults we longjmp back out
 * of the handler instead of crashing. */
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler: if no probe is in progress (canjump == 0) the signal is
 * unrelated, so restore the default disposition and re-raise to crash
 * normally; otherwise clear the flag and jump back to the sigsetjmp point. */
static void sigill_handler( int sig )
{
    if( !canjump )
    {
        signal( sig, SIG_DFL );
        raise( sig );
    }

    canjump = 0;
    siglongjmp( jmpbuf, 1 );
}
#endif
144 | | |
145 | | #if HAVE_MMX |
146 | | int x264_cpu_cpuid_test( void ); |
147 | | void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); |
148 | | uint64_t x264_cpu_xgetbv( int xcr ); |
149 | | |
/* Runtime CPU detection for x86/x86-64 via the CPUID instruction (and XGETBV
 * for OS state).  Returns a bitmask of X264_CPU_* flags describing supported
 * instruction sets plus a few performance heuristics keyed off the
 * vendor/family/model identification. */
uint32_t x264_cpu_detect( void )
{
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
    uint32_t vendor[4] = {0};  /* 12-byte vendor string; 4th word stays 0 as NUL terminator */
    uint32_t max_extended_cap, max_basic_cap;

#if !ARCH_X86_64
    /* On 32-bit, the CPUID instruction itself may be unsupported. */
    if( !x264_cpu_cpuid_test() )
        return 0;
#endif

    /* Leaf 0: highest basic leaf + vendor string.  The registers are stored
     * in ebx/edx/ecx order so that vendor[] reads as the ASCII string. */
    x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
    if( max_basic_cap == 0 )
        return 0;

    /* Leaf 1: basic feature flags. */
    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
    if( edx&0x00800000 )  /* EDX bit 23: MMX */
        cpu |= X264_CPU_MMX;
    else
        return cpu;       /* without MMX none of x264's asm is usable */
    if( edx&0x02000000 )  /* EDX bit 25: SSE (implies the MMX2 extensions) */
        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
    if( edx&0x04000000 )  /* EDX bit 26: SSE2 */
        cpu |= X264_CPU_SSE2;
    if( ecx&0x00000001 )  /* ECX bit 0: SSE3 */
        cpu |= X264_CPU_SSE3;
    if( ecx&0x00000200 )  /* ECX bit 9: SSSE3; such CPUs are assumed to have fast SSE2 */
        cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
    if( ecx&0x00080000 )  /* ECX bit 19: SSE4.1 */
        cpu |= X264_CPU_SSE4;
    if( ecx&0x00100000 )  /* ECX bit 20: SSE4.2 */
        cpu |= X264_CPU_SSE42;

    if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
    {
        /* AVX and wider also require the OS to have enabled the extended
         * register state, which XCR0 advertises. */
        uint64_t xcr0 = x264_cpu_xgetbv( 0 );
        if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
        {
            if( ecx&0x10000000 )  /* ECX bit 28: AVX */
                cpu |= X264_CPU_AVX;
            if( ecx&0x00001000 )  /* ECX bit 12: FMA3 */
                cpu |= X264_CPU_FMA3;

            if( max_basic_cap >= 7 )
            {
                /* Leaf 7: structured extended feature flags. */
                x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
                if( ebx&0x00000008 )  /* EBX bit 3: BMI1 */
                    cpu |= X264_CPU_BMI1;
                if( ebx&0x00000100 )  /* EBX bit 8: BMI2 */
                    cpu |= X264_CPU_BMI2;
                if( ebx&0x00000020 )  /* EBX bit 5: AVX2 */
                    cpu |= X264_CPU_AVX2;

                if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
                {
                    /* Require AVX512 F/DQ/CD/BW/VL together
                     * (EBX bits 16, 17, 28, 30, 31). */
                    if( (ebx&0xD0030000) == 0xD0030000 )
                        cpu |= X264_CPU_AVX512;
                }
            }
        }
    }

    /* Extended leaves (0x80000000 and up). */
    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    max_extended_cap = eax;

    if( max_extended_cap >= 0x80000001 )
    {
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );

        if( ecx&0x00000020 )
            cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
        if( ecx&0x00000040 ) /* SSE4a, AMD only */
        {
            /* Display family = base family + extended family. */
            int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
            cpu |= X264_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
            if( family == 0x14 )
            {
                cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
                cpu |= X264_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
                cpu |= X264_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
            }
            if( family == 0x16 )
            {
                cpu |= X264_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
                                                * compared to alternate instruction sequences that this
                                                * is equal or faster on almost all such functions. */
            }
        }

        if( cpu & X264_CPU_AVX )
        {
            if( ecx&0x00000800 ) /* XOP */
                cpu |= X264_CPU_XOP;
            if( ecx&0x00010000 ) /* FMA4 */
                cpu |= X264_CPU_FMA4;
        }

        if( !strcmp((char*)vendor, "AuthenticAMD") )
        {
            if( edx&0x00400000 )  /* AMD MMX extensions */
                cpu |= X264_CPU_MMX2;
            if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
                cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
        }
    }

    if( !strcmp((char*)vendor, "GenuineIntel") )
    {
        /* Re-read leaf 1 for family/model (eax was clobbered above). */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        if( family == 6 )
        {
            /* Detect Atom CPU */
            if( model == 28 )
            {
                cpu |= X264_CPU_SLOW_ATOM;
                cpu |= X264_CPU_SLOW_PSHUFB;
            }
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
            else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
                cpu |= X264_CPU_SLOW_SHUFFLE;
        }
    }

    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int cache = (ebx&0xff00)>>5; // clflush size, reported in 8-byte chunks: ((ebx>>8)&0xff)*8
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache && max_basic_cap >= 2 )
        {
            // Cache and TLB Information: walk the one-byte descriptors from
            // leaf 2 looking for ones that identify the L1 cacheline size.
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do {
                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
                max = buf[0]&0xff;   // low byte of eax = number of times to repeat leaf 2
                buf[0] &= ~0xff;     // the count byte itself is not a descriptor
                for( int j = 0; j < 4; j++ )
                    if( !(buf[j]>>31) )  // high bit set means the register holds no valid descriptors
                        while( buf[j] )
                        {
                            if( strchr( cache32_ids, buf[j]&0xff ) )
                                cache = 32;
                            if( strchr( cache64_ids, buf[j]&0xff ) )
                                cache = 64;
                            buf[j] >>= 8;
                        }
            } while( ++i < max );
        }

        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
            x264_log_internal( X264_LOG_WARNING, "unable to determine cacheline size\n" );
    }

#if STACK_ALIGNMENT < 16
    /* The ABI doesn't guarantee 16-byte stack alignment; asm must realign. */
    cpu |= X264_CPU_STACK_MOD4;
#endif

    return cpu;
}
326 | | |
327 | | #elif HAVE_ALTIVEC |
328 | | |
329 | | #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO |
330 | | |
331 | | #define HWCAP_PPC_ALTIVEC (1U << 28) |
332 | | |
333 | | uint32_t x264_cpu_detect( void ) |
334 | | { |
335 | | uint32_t flags = 0; |
336 | | |
337 | | unsigned long hwcap = x264_getauxval( AT_HWCAP ); |
338 | | |
339 | | if ( hwcap & HWCAP_PPC_ALTIVEC ) |
340 | | flags |= X264_CPU_ALTIVEC; |
341 | | |
342 | | return flags; |
343 | | } |
344 | | |
345 | | #elif SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD |
346 | | |
/* PowerPC AltiVec detection on macOS and the BSDs via sysctl; each OS
 * exposes the flag under a different name/selector. */
uint32_t x264_cpu_detect( void )
{
    /* Thank you VLC */
    uint32_t cpu = 0;
#if SYS_OPENBSD
    int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
#elif SYS_MACOSX
    int selectors[2] = { CTL_HW, HW_VECTORUNIT };
#endif
    int has_altivec = 0;
    size_t length = sizeof( has_altivec );
#if SYS_MACOSX || SYS_OPENBSD
    int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
#elif SYS_NETBSD
    int error = sysctlbyname( "machdep.altivec", &has_altivec, &length, NULL, 0 );
#else
    /* FreeBSD */
    int error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
#endif

    /* Only trust the flag if the sysctl call itself succeeded. */
    if( error == 0 && has_altivec != 0 )
        cpu |= X264_CPU_ALTIVEC;

    return cpu;
}
371 | | |
372 | | #elif SYS_LINUX |
373 | | |
/* PowerPC AltiVec detection on Linux without auxv support: execute an
 * AltiVec instruction under a SIGILL trap (see sigill_handler). */
uint32_t x264_cpu_detect( void )
{
#ifdef __NO_FPRS__
    /* Soft-float ABI: no FP/vector registers, so no AltiVec. */
    return 0;
#else
    static void (*oldsig)( int );

    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        /* The probe instruction faulted: AltiVec is not available. */
        signal( SIGILL, oldsig );
        return 0;
    }

    canjump = 1;
    /* mtspr 256 writes VRSAVE (enabling all vector registers), then a
     * harmless AltiVec instruction is executed; either raises SIGILL on a
     * CPU without AltiVec. */
    asm volatile( "mtspr 256, %0\n\t"
                  "vand 0, 0, 0\n\t"
                  :
                  : "r"(-1) );
    canjump = 0;

    signal( SIGILL, oldsig );

    return X264_CPU_ALTIVEC;
#endif
}
400 | | |
401 | | #else |
402 | | |
403 | | uint32_t x264_cpu_detect( void ) |
404 | | { |
405 | | return 0; |
406 | | } |
407 | | |
408 | | #endif |
409 | | |
410 | | #elif HAVE_ARMV6 |
411 | | |
412 | | void x264_cpu_neon_test( void ); |
413 | | int x264_cpu_fast_neon_mrc_test( void ); |
414 | | |
415 | | #define HWCAP_ARM_NEON (1U << 12) |
416 | | |
/* ARMv6/ARMv7 runtime detection.  NEON is detected via kernel hwcaps when
 * available, otherwise by executing a NEON instruction under a SIGILL trap
 * (see sigill_handler). */
uint32_t x264_cpu_detect( void )
{
    uint32_t flags = 0;
    flags |= X264_CPU_ARMV6;  /* baseline implied by the HAVE_ARMV6 build */

#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    unsigned long hwcap = x264_getauxval( AT_HWCAP );

    if ( hwcap & HWCAP_ARM_NEON )
        flags |= X264_CPU_NEON;
#else
    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)( int );
    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        /* The NEON probe instruction faulted: no NEON. */
        signal( SIGILL, oldsig );
        return flags;
    }

    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, oldsig );
#endif

    /* Reached either because the build assumes NEON (-mfpu=neon) or because
     * the probe above executed without faulting. */
    flags |= X264_CPU_NEON;
#endif

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
    // Don't test this on Windows; performance counters are readable, but
    // the PMNC is not readable.
#if !defined(__MACH__) && !defined(_WIN32)
    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
    return flags;
}
463 | | |
464 | | #elif HAVE_RISCV64 |
465 | | |
466 | | #define HWCAP_RISCV64_RVV (1 << ('V' - 'A')) |
467 | | |
468 | | uint32_t x264_cpu_detect( void ) |
469 | | { |
470 | | uint32_t flags = 0; |
471 | | |
472 | | #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO |
473 | | unsigned long hwcap = x264_getauxval( AT_HWCAP ); |
474 | | |
475 | | if ( hwcap & HWCAP_RISCV64_RVV ) |
476 | | flags |= X264_CPU_RVV; |
477 | | #else |
478 | | #if HAVE_RVV |
479 | | flags |= X264_CPU_RVV; |
480 | | #endif |
481 | | #endif |
482 | | |
483 | | return flags; |
484 | | } |
485 | | |
486 | | #elif HAVE_AARCH64 |
487 | | |
488 | | #if defined(__linux__) || HAVE_ELF_AUX_INFO |
489 | | |
490 | | #define HWCAP_AARCH64_ASIMDDP (1U << 20) |
491 | | #define HWCAP_AARCH64_SVE (1U << 22) |
492 | | #define HWCAP2_AARCH64_SVE2 (1U << 1) |
493 | | #define HWCAP2_AARCH64_I8MM (1U << 13) |
494 | | |
495 | | static uint32_t detect_flags( void ) |
496 | | { |
497 | | uint32_t flags = 0; |
498 | | |
499 | | unsigned long hwcap = x264_getauxval( AT_HWCAP ); |
500 | | unsigned long hwcap2 = x264_getauxval( AT_HWCAP2 ); |
501 | | |
502 | | if ( hwcap & HWCAP_AARCH64_ASIMDDP ) |
503 | | flags |= X264_CPU_DOTPROD; |
504 | | if ( hwcap2 & HWCAP2_AARCH64_I8MM ) |
505 | | flags |= X264_CPU_I8MM; |
506 | | if ( hwcap & HWCAP_AARCH64_SVE ) |
507 | | flags |= X264_CPU_SVE; |
508 | | if ( hwcap2 & HWCAP2_AARCH64_SVE2 ) |
509 | | flags |= X264_CPU_SVE2; |
510 | | |
511 | | return flags; |
512 | | } |
513 | | |
514 | | #elif defined(__APPLE__) |
515 | | #include <sys/sysctl.h> |
516 | | |
517 | | static int have_feature( const char *feature ) |
518 | | { |
519 | | int supported = 0; |
520 | | size_t size = sizeof(supported); |
521 | | if ( sysctlbyname( feature, &supported, &size, NULL, 0 ) ) |
522 | | return 0; |
523 | | return supported; |
524 | | } |
525 | | |
526 | | static uint32_t detect_flags( void ) |
527 | | { |
528 | | uint32_t flags = 0; |
529 | | |
530 | | if ( have_feature( "hw.optional.arm.FEAT_DotProd" ) ) |
531 | | flags |= X264_CPU_DOTPROD; |
532 | | if ( have_feature( "hw.optional.arm.FEAT_I8MM" ) ) |
533 | | flags |= X264_CPU_I8MM; |
534 | | /* No SVE and SVE2 feature detection available on Apple platforms. */ |
535 | | return flags; |
536 | | } |
537 | | |
538 | | #elif defined(_WIN32) |
539 | | #include <windows.h> |
540 | | |
541 | | static uint32_t detect_flags( void ) |
542 | | { |
543 | | uint32_t flags = 0; |
544 | | |
545 | | #ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE |
546 | | if ( IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) ) |
547 | | flags |= X264_CPU_DOTPROD; |
548 | | #endif |
549 | | #ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE |
550 | | if ( IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) ) |
551 | | flags |= X264_CPU_SVE; |
552 | | #endif |
553 | | #ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE |
554 | | if ( IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) ) |
555 | | flags |= X264_CPU_SVE2; |
556 | | #endif |
557 | | #ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE |
558 | | /* There's no PF_* flag that indicates whether plain I8MM is available |
559 | | * or not. But if SVE_I8MM is available, that also implies that |
560 | | * regular I8MM is available. */ |
561 | | if ( IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) ) |
562 | | flags |= X264_CPU_I8MM; |
563 | | #endif |
564 | | return flags; |
565 | | } |
566 | | |
567 | | #endif |
568 | | |
/* AArch64: combine compile-time feature assumptions with platform-specific
 * runtime detection (detect_flags) where a mechanism exists. */
uint32_t x264_cpu_detect( void )
{
    uint32_t flags = X264_CPU_ARMV8;  /* baseline for any AArch64 build */
#if HAVE_NEON
    flags |= X264_CPU_NEON;
#endif

    // If these features are enabled unconditionally in the compiler, we can
    // assume that they are available.
#ifdef __ARM_FEATURE_DOTPROD
    flags |= X264_CPU_DOTPROD;
#endif
#ifdef __ARM_FEATURE_MATMUL_INT8
    flags |= X264_CPU_I8MM;
#endif
#ifdef __ARM_FEATURE_SVE
    flags |= X264_CPU_SVE;
#endif
#ifdef __ARM_FEATURE_SVE2
    flags |= X264_CPU_SVE2;
#endif

    // Where possible, try to do runtime detection as well.
#if defined(__linux__) || HAVE_ELF_AUX_INFO || \
    defined(__APPLE__) || defined(_WIN32)
    flags |= detect_flags();
#endif

    return flags;
}
599 | | |
600 | | #elif HAVE_MSA |
601 | | |
/* MIPS: MSA support is a compile-time decision (HAVE_MSA), so no runtime
 * probing is performed. */
uint32_t x264_cpu_detect( void )
{
    return X264_CPU_MSA;
}
606 | | |
607 | | #elif HAVE_LSX |
608 | | |
609 | | #define LA_HWCAP_LSX ( 1U << 4 ) |
610 | | #define LA_HWCAP_LASX ( 1U << 5 ) |
611 | | |
612 | | uint32_t x264_cpu_detect( void ) |
613 | | { |
614 | | uint32_t flags = 0; |
615 | | uint32_t hwcap = (uint32_t)x264_getauxval( AT_HWCAP ); |
616 | | |
617 | | if( hwcap & LA_HWCAP_LSX ) |
618 | | flags |= X264_CPU_LSX; |
619 | | if( hwcap & LA_HWCAP_LASX ) |
620 | | flags |= X264_CPU_LASX; |
621 | | |
622 | | return flags; |
623 | | } |
624 | | |
625 | | #else |
626 | | |
627 | | uint32_t x264_cpu_detect( void ) |
628 | 0 | { |
629 | 0 | return 0; |
630 | 0 | } |
631 | | |
632 | | #endif |
633 | | |
/* Return the number of logical processors available to this process,
 * falling back to 1 whenever the count cannot be determined (or threading
 * is disabled at build time, making the answer irrelevant). */
int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif SYS_WINDOWS
    return x264_pthread_num_processors_np();

#elif SYS_LINUX
    /* Count the CPUs in this process's affinity mask rather than all CPUs
     * in the system, so cpuset/affinity restrictions are respected. */
    cpu_set_t mask;
    memset( &mask, 0, sizeof(mask) );
    if( sched_getaffinity( 0, sizeof(mask), &mask ) )
        return 1;
#if HAVE_CPU_COUNT
    return CPU_COUNT( &mask );
#else
    /* Manual population count over the raw bytes of the affinity mask. */
    int count = 0;
    const uint8_t *bytes = (const uint8_t *)&mask;
    for( size_t i = 0; i < sizeof(mask); i++ )
        for( uint8_t b = bytes[i]; b; b &= b - 1 )
            count++;
    return count;
#endif

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX
    int ncpu;
    size_t length = sizeof( ncpu );
    if( sysctlbyname("hw.logicalcpu", &ncpu, &length, NULL, 0) )
    {
        ncpu = 1;
    }
    return ncpu;

#elif defined(_SC_NPROCESSORS_ONLN)
    return sysconf( _SC_NPROCESSORS_ONLN );

#elif defined(_SC_NPROCESSORS_CONF)
    return sysconf( _SC_NPROCESSORS_CONF );

#else
    return 1;
#endif
}