Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/common/cpu.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Loren Merritt <lorenm@u.washington.edu>
5
 *          Laurent Aimar <fenrir@via.ecp.fr>
6
 *          Fiona Glaser <fiona@x264.com>
7
 *          Steve Borho <steve@borho.org>
8
 *          Hongbin Liu <liuhongbin1@huawei.com>
9
 *          Yimeng Su <yimeng.su@huawei.com>
10
 *          Josh Dekker <josh@itanimul.li>
11
 *          Jean-Baptiste Kempf <jb@videolan.org>
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
 *
27
 * This program is also available under a commercial proprietary license.
28
 * For more information, contact us at license @ x265.com.
29
 *****************************************************************************/
30
31
#include "cpu.h"
32
#include "common.h"
33
34
#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
35
#include <sys/auxv.h>
36
#endif
37
#if MACOS || SYS_FREEBSD
38
#include <sys/types.h>
39
#include <sys/sysctl.h>
40
#endif
41
#if SYS_OPENBSD
42
#include <sys/param.h>
43
#include <sys/sysctl.h>
44
#include <machine/cpu.h>
45
#endif
46
47
#if X265_ARCH_ARM && !defined(HAVE_NEON) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO)
48
#include <signal.h>
49
#include <setjmp.h>
50
static sigjmp_buf jmpbuf;
51
static volatile sig_atomic_t canjump = 0;
52
53
static void sigill_handler(int sig)
54
{
55
    if (!canjump)
56
    {
57
        signal(sig, SIG_DFL);
58
        raise(sig);
59
    }
60
61
    canjump = 0;
62
    siglongjmp(jmpbuf, 1);
63
}
64
65
#endif // if X265_ARCH_ARM
66
67
namespace X265_NS {
68
#if X265_ARCH_X86
69
static bool enable512 = false;
70
#endif
71
/* Table mapping human-readable capability names to X265_CPU_* flag sets.
 * Only the entries for the architecture being compiled are present; the
 * table always ends with an empty-name / zero-flags sentinel entry. */
#if X265_ARCH_X86
/* Cumulative x86 flag sets: each tier includes everything below it. */
#define TIER_MMX2   (X265_CPU_MMX | X265_CPU_MMX2)
#define TIER_SSE    (TIER_MMX2 | X265_CPU_SSE)
#define TIER_SSE2   (TIER_SSE | X265_CPU_SSE2)
#define TIER_SSE3   (TIER_SSE2 | X265_CPU_SSE3)
#define TIER_SSSE3  (TIER_SSE3 | X265_CPU_SSSE3)
#define TIER_SSE4   (TIER_SSSE3 | X265_CPU_SSE4)
#define TIER_SSE42  (TIER_SSE4 | X265_CPU_SSE42)
#define TIER_AVX    (TIER_SSE42 | X265_CPU_AVX)
#define TIER_BMI1   (TIER_AVX | X265_CPU_LZCNT | X265_CPU_BMI1)
#define TIER_BMI2   (TIER_BMI1 | X265_CPU_BMI2)
#define TIER_AVX2   (TIER_BMI2 | X265_CPU_FMA3 | X265_CPU_AVX2)
#endif

const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
    { "MMX2",            TIER_MMX2 },
    { "MMXEXT",          TIER_MMX2 },
    { "SSE",             TIER_SSE },
    { "SSE2Slow",        TIER_SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",            TIER_SSE2 },
    { "SSE2Fast",        TIER_SSE2 | X265_CPU_SSE2_IS_FAST },
    { "LZCNT",           X265_CPU_LZCNT },
    { "SSE3",            TIER_SSE3 },
    { "SSSE3",           TIER_SSSE3 },
    { "SSE4.1",          TIER_SSE4 },
    { "SSE4",            TIER_SSE4 },
    { "SSE4.2",          TIER_SSE42 },
    { "AVX",             TIER_AVX },
    { "XOP",             TIER_AVX | X265_CPU_XOP },
    { "FMA4",            TIER_AVX | X265_CPU_FMA4 },
    { "FMA3",            TIER_AVX | X265_CPU_FMA3 },
    { "BMI1",            TIER_BMI1 },
    { "BMI2",            TIER_BMI2 },
    { "AVX2",            TIER_AVX2 },
    { "AVX512",          TIER_AVX2 | X265_CPU_AVX512 },
    { "Cache32",         X265_CPU_CACHELINE_32 },
    { "Cache64",         X265_CPU_CACHELINE_64 },
    { "SlowAtom",        X265_CPU_SLOW_ATOM },
    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack",  X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",           X265_CPU_ARMV6 },
    { "NEON",            X265_CPU_NEON },
    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },

#elif X265_ARCH_ARM64
    { "NEON",            X265_CPU_NEON },
#if defined(HAVE_SVE)
    { "SVE",             X265_CPU_SVE },
#endif
#if defined(HAVE_SVE2)
    { "SVE2",            X265_CPU_SVE2 },
#endif
#if defined(HAVE_NEON_DOTPROD)
    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
#endif
#if defined(HAVE_NEON_I8MM)
    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
#endif
#if defined(HAVE_SVE2_BITPERM)
    { "SVE2_BitPerm",    X265_CPU_SVE2_BITPERM },
#endif

#elif X265_ARCH_POWER8
    { "Altivec",         X265_CPU_ALTIVEC },

#elif X265_ARCH_RISCV64
    { "RVV",             X265_CPU_RVV },
    { "Zbb",             X265_CPU_ZBB },

#endif // if X265_ARCH_X86
    { "", 0 },  // sentinel
};

#if X265_ARCH_X86
#undef TIER_AVX2
#undef TIER_BMI2
#undef TIER_BMI1
#undef TIER_AVX
#undef TIER_SSE42
#undef TIER_SSE4
#undef TIER_SSSE3
#undef TIER_SSE3
#undef TIER_SSE2
#undef TIER_SSE
#undef TIER_MMX2
#endif
142
143
/* Portable wrapper for reading an entry of the ELF auxiliary vector.
 *
 * type: the AT_* key to look up.
 * Returns the value reported by getauxval() or elf_aux_info(), whichever the
 * build was configured with; returns 0 when neither is available. */
unsigned long x265_getauxval(unsigned long type)
{
    unsigned long value = 0;

#if HAVE_GETAUXVAL
    value = getauxval(type);
#elif HAVE_ELF_AUX_INFO
    // value keeps its zero initialiser if the lookup stores nothing
    elf_aux_info(type, &value, sizeof(value));
#else
    (void)type;
#endif

    return value;
}
156
157
#if X265_ARCH_X86
158
159
extern "C" {
160
/* cpu-a.asm */
161
int PFX(cpu_cpuid_test)(void);
162
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
163
uint64_t PFX(cpu_xgetbv)(int xcr);
164
}
165
166
#if defined(_MSC_VER)
167
#pragma warning(disable: 4309) // truncation of constant value
168
#endif
169
170
bool detect512()
171
0
{
172
0
    return(enable512);
173
0
}
174
175
uint32_t cpu_detect(bool benableavx512 )
176
0
{
177
178
0
    uint32_t cpu = 0; 
179
0
    uint32_t eax, ebx, ecx, edx;
180
0
    uint32_t vendor[4] = { 0 };
181
0
    uint32_t max_extended_cap, max_basic_cap;
182
0
    uint64_t xcr0 = 0;
183
184
#if !X86_64
185
    if (!PFX(cpu_cpuid_test)())
186
        return 0;
187
#endif
188
189
0
    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
190
0
    if (max_basic_cap == 0)
191
0
        return 0;
192
193
0
    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
194
0
    if (edx & 0x00800000)
195
0
        cpu |= X265_CPU_MMX;
196
0
    else
197
0
        return cpu;
198
0
    if (edx & 0x02000000)
199
0
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
200
0
    if (edx & 0x04000000)
201
0
        cpu |= X265_CPU_SSE2;
202
0
    if (ecx & 0x00000001)
203
0
        cpu |= X265_CPU_SSE3;
204
0
    if (ecx & 0x00000200)
205
0
        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
206
0
    if (ecx & 0x00080000)
207
0
        cpu |= X265_CPU_SSE4;
208
0
    if (ecx & 0x00100000)
209
0
        cpu |= X265_CPU_SSE42;
210
211
0
    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
212
0
    {
213
        /* Check for OS support */
214
0
        xcr0 = PFX(cpu_xgetbv)(0);
215
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
216
0
        {
217
0
            if (ecx & 0x10000000)
218
0
            cpu |= X265_CPU_AVX;
219
0
            if (ecx & 0x00001000)
220
0
                cpu |= X265_CPU_FMA3;
221
0
        }
222
0
    }
223
224
0
    if (max_basic_cap >= 7)
225
0
    {
226
0
        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
227
        /* AVX2 requires OS support, but BMI1/2 don't. */
228
0
        if (ebx & 0x00000008)
229
0
            cpu |= X265_CPU_BMI1;
230
0
        if (ebx & 0x00000100)
231
0
            cpu |= X265_CPU_BMI2;
232
233
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
234
0
        {
235
0
            if (ebx & 0x00000020)
236
0
                cpu |= X265_CPU_AVX2;
237
0
            if (benableavx512)
238
0
            {
239
0
                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
240
0
                {
241
0
                    if ((ebx & 0xD0030000) == 0xD0030000)
242
0
                    {
243
0
                        cpu |= X265_CPU_AVX512;
244
0
                        enable512 = true;
245
0
                    }
246
0
                }
247
0
            }
248
0
        }
249
0
    }
250
251
0
    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
252
0
    max_extended_cap = eax;
253
254
0
    if (max_extended_cap >= 0x80000001)
255
0
    {
256
0
        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
257
258
0
        if (ecx & 0x00000020)
259
0
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
260
0
        if (ecx & 0x00000040) /* SSE4a, AMD only */
261
0
        {
262
0
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
263
0
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
264
0
            if (family == 0x14)
265
0
            {
266
0
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
267
0
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
268
0
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
269
0
            }
270
0
            if (family == 0x16)
271
0
            {
272
0
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
273
                                                * compared to alternate instruction sequences that this
274
                                                * is equal or faster on almost all such functions. */
275
0
            }
276
0
        }
277
278
0
        if (cpu & X265_CPU_AVX)
279
0
        {
280
0
            if (ecx & 0x00000800) /* XOP */
281
0
                cpu |= X265_CPU_XOP;
282
0
            if (ecx & 0x00010000) /* FMA4 */
283
0
                cpu |= X265_CPU_FMA4;
284
0
        }
285
286
0
        if (!strcmp((char*)vendor, "AuthenticAMD"))
287
0
        {
288
0
            if (edx & 0x00400000)
289
0
                cpu |= X265_CPU_MMX2;
290
0
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
291
0
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
292
0
        }
293
0
    }
294
295
0
    if (!strcmp((char*)vendor, "GenuineIntel"))
296
0
    {
297
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
298
0
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
299
0
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
300
0
        if (family == 6)
301
0
        {
302
            /* Detect Atom CPU */
303
0
            if (model == 28)
304
0
            {
305
0
                cpu |= X265_CPU_SLOW_ATOM;
306
0
                cpu |= X265_CPU_SLOW_PSHUFB;
307
0
            }
308
309
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
310
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
311
0
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
312
0
                cpu |= X265_CPU_SLOW_SHUFFLE;
313
0
        }
314
0
    }
315
316
0
    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
317
0
    {
318
        /* cacheline size is specified in 3 places, any of which may be missing */
319
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
320
0
        int cache = (ebx & 0xff00) >> 5; // cflush size
321
0
        if (!cache && max_extended_cap >= 0x80000006)
322
0
        {
323
0
            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
324
0
            cache = ecx & 0xff; // cacheline size
325
0
        }
326
0
        if (!cache && max_basic_cap >= 2)
327
0
        {
328
            // Cache and TLB Information
329
0
            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
330
0
            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
331
0
                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
332
0
            uint32_t buf[4];
333
0
            int max, i = 0;
334
0
            do
335
0
            {
336
0
                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
337
0
                max = buf[0] & 0xff;
338
0
                buf[0] &= ~0xff;
339
0
                for (int j = 0; j < 4; j++)
340
0
                {
341
0
                    if (!(buf[j] >> 31))
342
0
                        while (buf[j])
343
0
                        {
344
0
                            if (strchr(cache32_ids, buf[j] & 0xff))
345
0
                                cache = 32;
346
0
                            if (strchr(cache64_ids, buf[j] & 0xff))
347
0
                                cache = 64;
348
0
                            buf[j] >>= 8;
349
0
                        }
350
0
                }
351
0
            }
352
0
            while (++i < max);
353
0
        }
354
355
0
        if (cache == 32)
356
0
            cpu |= X265_CPU_CACHELINE_32;
357
0
        else if (cache == 64)
358
0
            cpu |= X265_CPU_CACHELINE_64;
359
0
        else
360
0
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
361
0
    }
362
363
#if BROKEN_STACK_ALIGNMENT
364
    cpu |= X265_CPU_STACK_MOD4;
365
#endif
366
367
0
    return cpu;
368
0
}
369
370
#elif X265_ARCH_ARM
371
372
extern "C" {
373
void PFX(cpu_neon_test)(void);
374
int PFX(cpu_fast_neon_mrc_test)(void);
375
}
376
377
#define X265_ARM_HWCAP_NEON (1U << 12)
378
379
/* Detect 32-bit ARM capabilities.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns a bitmask of X265_CPU_* flags.  Nothing is probed (and 0 is
 * returned) unless the build has both HAVE_ARMV6 and ENABLE_ASSEMBLY. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    uint32_t flags = 0;

#if HAVE_ARMV6 && ENABLE_ASSEMBLY
    flags |= X265_CPU_ARMV6;

#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
    unsigned long hwcap = x265_getauxval(AT_HWCAP);

    if (hwcap & X265_ARM_HWCAP_NEON)
        flags |= X265_CPU_NEON;
#else
    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    // Run the NEON test routine with a temporary SIGILL handler installed.
    // If the handler fires it jumps back to sigsetjmp(), which then returns
    // non-zero and we report the flags gathered so far, without NEON.
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    PFX(cpu_neon_test)();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;
#endif

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x265 instance disables or reinits the counters while x265 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}
425
426
#elif X265_ARCH_ARM64
427
#include "aarch64/cpu.h"
428
429
/* Detect AArch64 capabilities.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns whatever aarch64_cpu_detect() reports when ENABLE_ASSEMBLY is
 * defined, otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;

#ifdef ENABLE_ASSEMBLY
    int detected = aarch64_cpu_detect();
#else
    int detected = 0;
#endif

    return detected;
}
440
441
#elif X265_ARCH_RISCV64
442
#include "riscv64/cpu.h"
443
444
/* Detect RISC-V 64 capabilities.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns whatever riscv64_cpu_detect() reports when ENABLE_ASSEMBLY is
 * defined, otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;

#ifdef ENABLE_ASSEMBLY
    uint32_t detected = riscv64_cpu_detect();
#else
    uint32_t detected = 0;
#endif

    return detected;
}
455
456
#elif X265_ARCH_POWER8
457
458
/* POWER8 capability detection: purely a compile-time decision.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns X265_CPU_ALTIVEC when the build has HAVE_ALTIVEC, otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;

#if HAVE_ALTIVEC
    return X265_CPU_ALTIVEC;
#else
    return 0;
#endif
}
466
467
#else // if X265_ARCH_POWER8
468
469
/* Fallback for architectures with no detection code: no capabilities.
 *
 * benableavx512: ignored.
 *
 * Always returns 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    return 0;
}
473
474
#endif // if X265_ARCH_X86
475
}
476