Coverage Report

Created: 2025-11-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/x265/source/common/cpu.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Loren Merritt <lorenm@u.washington.edu>
5
 *          Laurent Aimar <fenrir@via.ecp.fr>
6
 *          Fiona Glaser <fiona@x264.com>
7
 *          Steve Borho <steve@borho.org>
8
 *          Hongbin Liu <liuhongbin1@huawei.com>
9
 *          Yimeng Su <yimeng.su@huawei.com>
10
 *          Josh Dekker <josh@itanimul.li>
11
 *          Jean-Baptiste Kempf <jb@videolan.org>
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
 *
27
 * This program is also available under a commercial proprietary license.
28
 * For more information, contact us at license @ x265.com.
29
 *****************************************************************************/
30
31
#include "cpu.h"
32
#include "common.h"
33
34
#if MACOS || SYS_FREEBSD
35
#include <sys/types.h>
36
#include <sys/sysctl.h>
37
#endif
38
#if SYS_OPENBSD
39
#include <sys/param.h>
40
#include <sys/sysctl.h>
41
#include <machine/cpu.h>
42
#endif
43
44
#if X265_ARCH_ARM && !defined(HAVE_NEON)
45
#include <signal.h>
46
#include <setjmp.h>
47
static sigjmp_buf jmpbuf;
48
static volatile sig_atomic_t canjump = 0;
49
50
static void sigill_handler(int sig)
51
{
52
    if (!canjump)
53
    {
54
        signal(sig, SIG_DFL);
55
        raise(sig);
56
    }
57
58
    canjump = 0;
59
    siglongjmp(jmpbuf, 1);
60
}
61
62
#endif // if X265_ARCH_ARM
63
64
namespace X265_NS {
65
#if X265_ARCH_X86
66
static bool enable512 = false;
67
#endif
68
/* Table pairing capability name strings with X265_CPU_* flag masks for the
 * architecture selected at build time.  The last entry has an empty name and
 * a mask of 0. */
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
/* Cumulative masks shared by several rows below; they are undefined again
 * once the rows that need them have been emitted. */
#define UPTO_MMX2 (X265_CPU_MMX | X265_CPU_MMX2)
#define UPTO_SSE2 (UPTO_MMX2 | X265_CPU_SSE | X265_CPU_SSE2)
#define UPTO_AVX  (UPTO_SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX)
#define UPTO_AVX2 (UPTO_AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2)
    { "MMX2",           UPTO_MMX2 },
    { "MMXEXT",         UPTO_MMX2 },
    { "SSE",            UPTO_MMX2 | X265_CPU_SSE },
    { "SSE2Slow",       UPTO_SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",           UPTO_SSE2 },
    { "SSE2Fast",       UPTO_SSE2 | X265_CPU_SSE2_IS_FAST },
    { "LZCNT",          X265_CPU_LZCNT },
    { "SSE3",           UPTO_SSE2 | X265_CPU_SSE3 },
    { "SSSE3",          UPTO_SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
    { "SSE4.1",         UPTO_SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4",           UPTO_SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4.2",         UPTO_SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
    { "AVX",            UPTO_AVX },
    { "XOP",            UPTO_AVX | X265_CPU_XOP },
    { "FMA4",           UPTO_AVX | X265_CPU_FMA4 },
    { "FMA3",           UPTO_AVX | X265_CPU_FMA3 },
    { "BMI1",           UPTO_AVX | X265_CPU_LZCNT | X265_CPU_BMI1 },
    { "BMI2",           UPTO_AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
    { "AVX2",           UPTO_AVX2 },
    { "AVX512",         UPTO_AVX2 | X265_CPU_AVX512 },
#undef UPTO_AVX2
#undef UPTO_AVX
#undef UPTO_SSE2
#undef UPTO_MMX2
    /* Rows that name a single flag each. */
    { "Cache32",        X265_CPU_CACHELINE_32 },
    { "Cache64",        X265_CPU_CACHELINE_64 },
    { "SlowAtom",       X265_CPU_SLOW_ATOM },
    { "SlowPshufb",     X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",    X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",    X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack", X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",          X265_CPU_ARMV6 },
    { "NEON",           X265_CPU_NEON },
    { "FastNeonMRC",    X265_CPU_FAST_NEON_MRC },

#elif X265_ARCH_ARM64
    { "NEON",           X265_CPU_NEON },
    /* Each optional extension row exists only when its HAVE_* macro is defined. */
#if defined(HAVE_SVE)
    { "SVE",            X265_CPU_SVE },
#endif
#if defined(HAVE_SVE2)
    { "SVE2",           X265_CPU_SVE2 },
#endif
#if defined(HAVE_NEON_DOTPROD)
    { "Neon_DotProd",   X265_CPU_NEON_DOTPROD },
#endif
#if defined(HAVE_NEON_I8MM)
    { "Neon_I8MM",      X265_CPU_NEON_I8MM },
#endif
#if defined(HAVE_SVE2_BITPERM)
    { "SVE2_BitPerm",   X265_CPU_SVE2_BITPERM },
#endif

#elif X265_ARCH_POWER8
    { "Altivec",        X265_CPU_ALTIVEC },

#elif X265_ARCH_RISCV64
    { "RVV",            X265_CPU_RVV },

#endif // architecture-specific rows
    { "", 0 },
};
138
139
#if X265_ARCH_X86
140
141
extern "C" {
142
/* cpu-a.asm */
143
int PFX(cpu_cpuid_test)(void);
144
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
145
uint64_t PFX(cpu_xgetbv)(int xcr);
146
}
147
148
#if defined(_MSC_VER)
149
#pragma warning(disable: 4309) // truncation of constant value
150
#endif
151
152
bool detect512()
153
0
{
154
0
    return(enable512);
155
0
}
156
157
uint32_t cpu_detect(bool benableavx512 )
158
0
{
159
160
0
    uint32_t cpu = 0; 
161
0
    uint32_t eax, ebx, ecx, edx;
162
0
    uint32_t vendor[4] = { 0 };
163
0
    uint32_t max_extended_cap, max_basic_cap;
164
0
    uint64_t xcr0 = 0;
165
166
#if !X86_64
167
    if (!PFX(cpu_cpuid_test)())
168
        return 0;
169
#endif
170
171
0
    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
172
0
    if (max_basic_cap == 0)
173
0
        return 0;
174
175
0
    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
176
0
    if (edx & 0x00800000)
177
0
        cpu |= X265_CPU_MMX;
178
0
    else
179
0
        return cpu;
180
0
    if (edx & 0x02000000)
181
0
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
182
0
    if (edx & 0x04000000)
183
0
        cpu |= X265_CPU_SSE2;
184
0
    if (ecx & 0x00000001)
185
0
        cpu |= X265_CPU_SSE3;
186
0
    if (ecx & 0x00000200)
187
0
        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
188
0
    if (ecx & 0x00080000)
189
0
        cpu |= X265_CPU_SSE4;
190
0
    if (ecx & 0x00100000)
191
0
        cpu |= X265_CPU_SSE42;
192
193
0
    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
194
0
    {
195
        /* Check for OS support */
196
0
        xcr0 = PFX(cpu_xgetbv)(0);
197
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
198
0
        {
199
0
            if (ecx & 0x10000000)
200
0
            cpu |= X265_CPU_AVX;
201
0
            if (ecx & 0x00001000)
202
0
                cpu |= X265_CPU_FMA3;
203
0
        }
204
0
    }
205
206
0
    if (max_basic_cap >= 7)
207
0
    {
208
0
        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
209
        /* AVX2 requires OS support, but BMI1/2 don't. */
210
0
        if (ebx & 0x00000008)
211
0
            cpu |= X265_CPU_BMI1;
212
0
        if (ebx & 0x00000100)
213
0
            cpu |= X265_CPU_BMI2;
214
215
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
216
0
        {
217
0
            if (ebx & 0x00000020)
218
0
                cpu |= X265_CPU_AVX2;
219
0
            if (benableavx512)
220
0
            {
221
0
                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
222
0
                {
223
0
                    if ((ebx & 0xD0030000) == 0xD0030000)
224
0
                    {
225
0
                        cpu |= X265_CPU_AVX512;
226
0
                        enable512 = true;
227
0
                    }
228
0
                }
229
0
            }
230
0
        }
231
0
    }
232
233
0
    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
234
0
    max_extended_cap = eax;
235
236
0
    if (max_extended_cap >= 0x80000001)
237
0
    {
238
0
        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
239
240
0
        if (ecx & 0x00000020)
241
0
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
242
0
        if (ecx & 0x00000040) /* SSE4a, AMD only */
243
0
        {
244
0
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
245
0
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
246
0
            if (family == 0x14)
247
0
            {
248
0
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
249
0
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
250
0
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
251
0
            }
252
0
            if (family == 0x16)
253
0
            {
254
0
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
255
                                                * compared to alternate instruction sequences that this
256
                                                * is equal or faster on almost all such functions. */
257
0
            }
258
0
        }
259
260
0
        if (cpu & X265_CPU_AVX)
261
0
        {
262
0
            if (ecx & 0x00000800) /* XOP */
263
0
                cpu |= X265_CPU_XOP;
264
0
            if (ecx & 0x00010000) /* FMA4 */
265
0
                cpu |= X265_CPU_FMA4;
266
0
        }
267
268
0
        if (!strcmp((char*)vendor, "AuthenticAMD"))
269
0
        {
270
0
            if (edx & 0x00400000)
271
0
                cpu |= X265_CPU_MMX2;
272
0
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
273
0
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
274
0
        }
275
0
    }
276
277
0
    if (!strcmp((char*)vendor, "GenuineIntel"))
278
0
    {
279
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
280
0
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
281
0
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
282
0
        if (family == 6)
283
0
        {
284
            /* Detect Atom CPU */
285
0
            if (model == 28)
286
0
            {
287
0
                cpu |= X265_CPU_SLOW_ATOM;
288
0
                cpu |= X265_CPU_SLOW_PSHUFB;
289
0
            }
290
291
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
292
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
293
0
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
294
0
                cpu |= X265_CPU_SLOW_SHUFFLE;
295
0
        }
296
0
    }
297
298
0
    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
299
0
    {
300
        /* cacheline size is specified in 3 places, any of which may be missing */
301
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
302
0
        int cache = (ebx & 0xff00) >> 5; // cflush size
303
0
        if (!cache && max_extended_cap >= 0x80000006)
304
0
        {
305
0
            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
306
0
            cache = ecx & 0xff; // cacheline size
307
0
        }
308
0
        if (!cache && max_basic_cap >= 2)
309
0
        {
310
            // Cache and TLB Information
311
0
            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
312
0
            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
313
0
                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
314
0
            uint32_t buf[4];
315
0
            int max, i = 0;
316
0
            do
317
0
            {
318
0
                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
319
0
                max = buf[0] & 0xff;
320
0
                buf[0] &= ~0xff;
321
0
                for (int j = 0; j < 4; j++)
322
0
                {
323
0
                    if (!(buf[j] >> 31))
324
0
                        while (buf[j])
325
0
                        {
326
0
                            if (strchr(cache32_ids, buf[j] & 0xff))
327
0
                                cache = 32;
328
0
                            if (strchr(cache64_ids, buf[j] & 0xff))
329
0
                                cache = 64;
330
0
                            buf[j] >>= 8;
331
0
                        }
332
0
                }
333
0
            }
334
0
            while (++i < max);
335
0
        }
336
337
0
        if (cache == 32)
338
0
            cpu |= X265_CPU_CACHELINE_32;
339
0
        else if (cache == 64)
340
0
            cpu |= X265_CPU_CACHELINE_64;
341
0
        else
342
0
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
343
0
    }
344
345
#if BROKEN_STACK_ALIGNMENT
346
    cpu |= X265_CPU_STACK_MOD4;
347
#endif
348
349
0
    return cpu;
350
0
}
351
352
#elif X265_ARCH_ARM
353
354
extern "C" {
355
void PFX(cpu_neon_test)(void);
356
int PFX(cpu_fast_neon_mrc_test)(void);
357
}
358
359
/* Detects 32-bit ARM CPU capabilities.
 *
 * benableavx512: unused in this variant; only the x86 cpu_detect() reads it.
 *
 * Returns a bitmask of X265_CPU_* flags.  Nothing is reported unless the build
 * has both HAVE_ARMV6 and ENABLE_ASSEMBLY set. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    uint32_t flags = 0;

#if HAVE_ARMV6 && ENABLE_ASSEMBLY
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
    // NOTE(review): sigill_handler/jmpbuf/canjump are declared under
    // "!defined(HAVE_NEON)" while this section uses "!HAVE_NEON"; the two
    // conditions differ when HAVE_NEON is defined as 0 -- confirm against the
    // build configuration.
#if !HAVE_NEON
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        // Reached through siglongjmp() from sigill_handler: the NEON probe
        // raised SIGILL, so return without X265_CPU_NEON.
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    PFX(cpu_neon_test)();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // encoder instance disables or reinits the counters while they are in use
    // here, which may result in incorrect detection and the counters stuck
    // enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}
399
400
#elif X265_ARCH_ARM64
401
#include "aarch64/cpu.h"
402
403
/* Detects AArch64 CPU capabilities.
 *
 * benableavx512: unused in this variant.
 *
 * Returns the value of aarch64_cpu_detect() when ENABLE_ASSEMBLY is defined,
 * otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    /* Same type as the return value, as in the RISC-V variant. */
    uint32_t flags = 0;

#ifdef ENABLE_ASSEMBLY
    flags = aarch64_cpu_detect();
#endif

    return flags;
}
414
415
#elif X265_ARCH_RISCV64
416
#include "riscv64/cpu.h"
417
418
/* Detects 64-bit RISC-V CPU capabilities.
 *
 * benableavx512: unused in this variant.
 *
 * Returns the value of riscv64_cpu_detect() when ENABLE_ASSEMBLY is defined,
 * otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;

#ifdef ENABLE_ASSEMBLY
    const uint32_t detected = riscv64_cpu_detect();
#else
    const uint32_t detected = 0;
#endif

    return detected;
}
429
430
#elif X265_ARCH_POWER8
431
432
/* POWER8 capability detection, decided entirely at build time.
 *
 * benableavx512: unused in this variant.
 *
 * Returns X265_CPU_ALTIVEC when HAVE_ALTIVEC is set, otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
#if HAVE_ALTIVEC
    return X265_CPU_ALTIVEC;
#else
    return 0;
#endif
}
440
441
#else // if X265_ARCH_POWER8
442
443
/* Fallback used when none of the architecture branches above is selected:
 * no capabilities are reported.
 *
 * benableavx512: unused in this variant.
 *
 * Always returns 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    return 0;
}
447
448
#endif // if X265_ARCH_X86
449
}
450