Coverage Report

Created: 2026-01-20 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/x265/source/common/cpu.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Loren Merritt <lorenm@u.washington.edu>
5
 *          Laurent Aimar <fenrir@via.ecp.fr>
6
 *          Fiona Glaser <fiona@x264.com>
7
 *          Steve Borho <steve@borho.org>
8
 *          Hongbin Liu <liuhongbin1@huawei.com>
9
 *          Yimeng Su <yimeng.su@huawei.com>
10
 *          Josh Dekker <josh@itanimul.li>
11
 *          Jean-Baptiste Kempf <jb@videolan.org>
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
 *
27
 * This program is also available under a commercial proprietary license.
28
 * For more information, contact us at license @ x265.com.
29
 *****************************************************************************/
30
31
#include "cpu.h"
32
#include "common.h"
33
34
#if MACOS || SYS_FREEBSD
35
#include <sys/types.h>
36
#include <sys/sysctl.h>
37
#endif
38
#if SYS_OPENBSD
39
#include <sys/param.h>
40
#include <sys/sysctl.h>
41
#include <machine/cpu.h>
42
#endif
43
44
#if X265_ARCH_ARM && !defined(HAVE_NEON)
45
#include <signal.h>
46
#include <setjmp.h>
47
/* Jump target and guard flag used while probing instruction support: the
 * prober arms `canjump`, runs a possibly-illegal instruction, and the SIGILL
 * handler unwinds back to the sigsetjmp point. */
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler for the NEON probe. If no probe is in flight (canjump not
 * armed), the signal is unexpected: restore the default disposition and
 * re-raise so the process dies normally. Otherwise disarm the guard and jump
 * back to the probe's sigsetjmp point. */
static void sigill_handler(int sig)
{
    if (canjump == 0)
    {
        /* SIGILL arrived outside a probe window — not ours to swallow. */
        signal(sig, SIG_DFL);
        raise(sig);
    }
    canjump = 0;
    siglongjmp(jmpbuf, 1);
}
61
62
#endif // if X265_ARCH_ARM
63
64
namespace X265_NS {
65
#if X265_ARCH_X86
66
static bool enable512 = false;
67
#endif
68
/* Table mapping human-readable capability names to X265_CPU_* flag masks for
 * the architecture this file is compiled for. Cumulative ISA levels (e.g.
 * "SSE4.2" implies SSE..SSE4) are built up via the temporary macros below.
 * The list is terminated by a sentinel entry with an empty name and mask 0. */
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
/* Each macro accumulates the baseline implied by the previous ISA level. */
#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
    { "MMX2",        MMX2 },
    { "MMXEXT",      MMX2 },
    { "SSE",         MMX2 | X265_CPU_SSE },
#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
    { "SSE2Slow",    SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",        SSE2 },
    { "SSE2Fast",    SSE2 | X265_CPU_SSE2_IS_FAST },
    { "LZCNT", X265_CPU_LZCNT },
    { "SSE3",        SSE2 | X265_CPU_SSE3 },
    { "SSSE3",       SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
    { "SSE4.1",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4",        SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4.2",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
    { "AVX",         AVX },
    { "XOP",         AVX | X265_CPU_XOP },
    { "FMA4",        AVX | X265_CPU_FMA4 },
    { "FMA3",        AVX | X265_CPU_FMA3 },
    { "BMI1",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 },
    { "BMI2",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
    { "AVX2", AVX2},
    { "AVX512", AVX2 | X265_CPU_AVX512 },
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
    /* Non-ISA attribute flags (cacheline size, per-uarch slowness hints). */
    { "Cache32",         X265_CPU_CACHELINE_32 },
    { "Cache64",         X265_CPU_CACHELINE_64 },
    { "SlowAtom",        X265_CPU_SLOW_ATOM },
    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack",  X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",           X265_CPU_ARMV6 },
    { "NEON",            X265_CPU_NEON },
    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },

#elif X265_ARCH_ARM64
    { "NEON",            X265_CPU_NEON },
#if defined(HAVE_SVE)
    { "SVE",            X265_CPU_SVE },
#endif
#if defined(HAVE_SVE2)
    { "SVE2",            X265_CPU_SVE2 },
#endif
#if defined(HAVE_NEON_DOTPROD)
    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
#endif
#if defined(HAVE_NEON_I8MM)
    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
#endif
#if defined(HAVE_SVE2_BITPERM)
    { "SVE2_BitPerm",    X265_CPU_SVE2_BITPERM },
#endif
#elif X265_ARCH_POWER8
    { "Altivec",         X265_CPU_ALTIVEC },

#elif X265_ARCH_RISCV64
    { "RVV",           X265_CPU_RVV },
    { "Zbb",           X265_CPU_ZBB },

#endif // if X265_ARCH_X86
    { "", 0 },  /* sentinel: end of table */
};
139
140
#if X265_ARCH_X86
141
142
extern "C" {
/* cpu-a.asm — x86 assembly helpers */
/* Returns non-zero when the CPUID instruction is usable (only probed on 32-bit builds). */
int PFX(cpu_cpuid_test)(void);
/* Executes CPUID for leaf `op`, writing all four result registers. */
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
/* Executes XGETBV for the given extended control register index. */
uint64_t PFX(cpu_xgetbv)(int xcr);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4309) // truncation of constant value
#endif
152
153
bool detect512()
154
0
{
155
0
    return(enable512);
156
0
}
157
158
uint32_t cpu_detect(bool benableavx512 )
159
0
{
160
161
0
    uint32_t cpu = 0; 
162
0
    uint32_t eax, ebx, ecx, edx;
163
0
    uint32_t vendor[4] = { 0 };
164
0
    uint32_t max_extended_cap, max_basic_cap;
165
0
    uint64_t xcr0 = 0;
166
167
#if !X86_64
168
    if (!PFX(cpu_cpuid_test)())
169
        return 0;
170
#endif
171
172
0
    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
173
0
    if (max_basic_cap == 0)
174
0
        return 0;
175
176
0
    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
177
0
    if (edx & 0x00800000)
178
0
        cpu |= X265_CPU_MMX;
179
0
    else
180
0
        return cpu;
181
0
    if (edx & 0x02000000)
182
0
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
183
0
    if (edx & 0x04000000)
184
0
        cpu |= X265_CPU_SSE2;
185
0
    if (ecx & 0x00000001)
186
0
        cpu |= X265_CPU_SSE3;
187
0
    if (ecx & 0x00000200)
188
0
        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
189
0
    if (ecx & 0x00080000)
190
0
        cpu |= X265_CPU_SSE4;
191
0
    if (ecx & 0x00100000)
192
0
        cpu |= X265_CPU_SSE42;
193
194
0
    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
195
0
    {
196
        /* Check for OS support */
197
0
        xcr0 = PFX(cpu_xgetbv)(0);
198
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
199
0
        {
200
0
            if (ecx & 0x10000000)
201
0
            cpu |= X265_CPU_AVX;
202
0
            if (ecx & 0x00001000)
203
0
                cpu |= X265_CPU_FMA3;
204
0
        }
205
0
    }
206
207
0
    if (max_basic_cap >= 7)
208
0
    {
209
0
        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
210
        /* AVX2 requires OS support, but BMI1/2 don't. */
211
0
        if (ebx & 0x00000008)
212
0
            cpu |= X265_CPU_BMI1;
213
0
        if (ebx & 0x00000100)
214
0
            cpu |= X265_CPU_BMI2;
215
216
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
217
0
        {
218
0
            if (ebx & 0x00000020)
219
0
                cpu |= X265_CPU_AVX2;
220
0
            if (benableavx512)
221
0
            {
222
0
                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
223
0
                {
224
0
                    if ((ebx & 0xD0030000) == 0xD0030000)
225
0
                    {
226
0
                        cpu |= X265_CPU_AVX512;
227
0
                        enable512 = true;
228
0
                    }
229
0
                }
230
0
            }
231
0
        }
232
0
    }
233
234
0
    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
235
0
    max_extended_cap = eax;
236
237
0
    if (max_extended_cap >= 0x80000001)
238
0
    {
239
0
        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
240
241
0
        if (ecx & 0x00000020)
242
0
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
243
0
        if (ecx & 0x00000040) /* SSE4a, AMD only */
244
0
        {
245
0
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
246
0
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
247
0
            if (family == 0x14)
248
0
            {
249
0
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
250
0
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
251
0
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
252
0
            }
253
0
            if (family == 0x16)
254
0
            {
255
0
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
256
                                                * compared to alternate instruction sequences that this
257
                                                * is equal or faster on almost all such functions. */
258
0
            }
259
0
        }
260
261
0
        if (cpu & X265_CPU_AVX)
262
0
        {
263
0
            if (ecx & 0x00000800) /* XOP */
264
0
                cpu |= X265_CPU_XOP;
265
0
            if (ecx & 0x00010000) /* FMA4 */
266
0
                cpu |= X265_CPU_FMA4;
267
0
        }
268
269
0
        if (!strcmp((char*)vendor, "AuthenticAMD"))
270
0
        {
271
0
            if (edx & 0x00400000)
272
0
                cpu |= X265_CPU_MMX2;
273
0
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
274
0
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
275
0
        }
276
0
    }
277
278
0
    if (!strcmp((char*)vendor, "GenuineIntel"))
279
0
    {
280
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
281
0
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
282
0
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
283
0
        if (family == 6)
284
0
        {
285
            /* Detect Atom CPU */
286
0
            if (model == 28)
287
0
            {
288
0
                cpu |= X265_CPU_SLOW_ATOM;
289
0
                cpu |= X265_CPU_SLOW_PSHUFB;
290
0
            }
291
292
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
293
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
294
0
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
295
0
                cpu |= X265_CPU_SLOW_SHUFFLE;
296
0
        }
297
0
    }
298
299
0
    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
300
0
    {
301
        /* cacheline size is specified in 3 places, any of which may be missing */
302
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
303
0
        int cache = (ebx & 0xff00) >> 5; // cflush size
304
0
        if (!cache && max_extended_cap >= 0x80000006)
305
0
        {
306
0
            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
307
0
            cache = ecx & 0xff; // cacheline size
308
0
        }
309
0
        if (!cache && max_basic_cap >= 2)
310
0
        {
311
            // Cache and TLB Information
312
0
            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
313
0
            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
314
0
                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
315
0
            uint32_t buf[4];
316
0
            int max, i = 0;
317
0
            do
318
0
            {
319
0
                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
320
0
                max = buf[0] & 0xff;
321
0
                buf[0] &= ~0xff;
322
0
                for (int j = 0; j < 4; j++)
323
0
                {
324
0
                    if (!(buf[j] >> 31))
325
0
                        while (buf[j])
326
0
                        {
327
0
                            if (strchr(cache32_ids, buf[j] & 0xff))
328
0
                                cache = 32;
329
0
                            if (strchr(cache64_ids, buf[j] & 0xff))
330
0
                                cache = 64;
331
0
                            buf[j] >>= 8;
332
0
                        }
333
0
                }
334
0
            }
335
0
            while (++i < max);
336
0
        }
337
338
0
        if (cache == 32)
339
0
            cpu |= X265_CPU_CACHELINE_32;
340
0
        else if (cache == 64)
341
0
            cpu |= X265_CPU_CACHELINE_64;
342
0
        else
343
0
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
344
0
    }
345
346
#if BROKEN_STACK_ALIGNMENT
347
    cpu |= X265_CPU_STACK_MOD4;
348
#endif
349
350
0
    return cpu;
351
0
}
352
353
#elif X265_ARCH_ARM
354
355
extern "C" {
/* cpu-a.asm — ARM assembly helpers */
/* Probe that executes a NEON instruction (raises SIGILL when NEON is absent). */
void PFX(cpu_neon_test)(void);
/* Timing-based test distinguishing cores with a fast NEON->ARM MRC path
 * (e.g. Cortex-A9) — see the comment block below. */
int PFX(cpu_fast_neon_mrc_test)(void);
}

/* ARM (32-bit) capability detection.
 *
 * When not built with -mfpu=neon, NEON support is probed at runtime by
 * executing a NEON instruction under a temporary SIGILL handler
 * (sigill_handler): if the instruction faults, control returns here via
 * sigsetjmp and only the flags gathered so far are reported.
 *
 * @param benableavx512  unused on ARM (x86-only toggle).
 * @return bitmask of X265_CPU_* flags detected.
 */
uint32_t cpu_detect(bool benableavx512)
{
    int flags = 0;

#if HAVE_ARMV6 && ENABLE_ASSEMBLY
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        /* NEON probe faulted: restore the old handler, report what we have. */
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    PFX(cpu_neon_test)();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}
400
401
#elif X265_ARCH_ARM64
402
#include "aarch64/cpu.h"
403
404
/* AArch64: delegate capability detection to the aarch64 helper when the
 * assembly backend is built; otherwise report no capabilities. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#ifdef ENABLE_ASSEMBLY
    return aarch64_cpu_detect();
#else
    return 0;
#endif
}
415
416
#elif X265_ARCH_RISCV64
417
#include "riscv64/cpu.h"
418
419
/* RISC-V 64: delegate capability detection to the riscv64 helper when the
 * assembly backend is built; otherwise report no capabilities. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#ifdef ENABLE_ASSEMBLY
    return riscv64_cpu_detect();
#else
    return 0;
#endif
}
430
431
#elif X265_ARCH_POWER8
432
433
/* POWER8: AltiVec support is a compile-time decision, so detection reduces
 * to reporting the build configuration. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#if HAVE_ALTIVEC
    return X265_CPU_ALTIVEC;
#else
    return 0;
#endif
}
441
442
#else // if X265_ARCH_POWER8
443
444
/* Generic fallback for architectures with no capability detection: report
 * no optional CPU features. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* unused on this architecture */
    return 0;
}
448
449
#endif // if X265_ARCH_X86
450
}
451