Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/common/cpu.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Loren Merritt <lorenm@u.washington.edu>
5
 *          Laurent Aimar <fenrir@via.ecp.fr>
6
 *          Fiona Glaser <fiona@x264.com>
7
 *          Steve Borho <steve@borho.org>
8
 *          Hongbin Liu <liuhongbin1@huawei.com>
9
 *          Yimeng Su <yimeng.su@huawei.com>
10
 *          Josh Dekker <josh@itanimul.li>
11
 *          Jean-Baptiste Kempf <jb@videolan.org>
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
 *
27
 * This program is also available under a commercial proprietary license.
28
 * For more information, contact us at license @ x265.com.
29
 *****************************************************************************/
30
31
#include "cpu.h"
32
#include "common.h"
33
34
#if MACOS || SYS_FREEBSD
35
#include <sys/types.h>
36
#include <sys/sysctl.h>
37
#endif
38
#if SYS_OPENBSD
39
#include <sys/param.h>
40
#include <sys/sysctl.h>
41
#include <machine/cpu.h>
42
#endif
43
44
#if X265_ARCH_ARM && !defined(HAVE_NEON)
45
#include <signal.h>
46
#include <setjmp.h>
47
static sigjmp_buf jmpbuf;
48
static volatile sig_atomic_t canjump = 0;
49
50
static void sigill_handler(int sig)
51
{
52
    if (!canjump)
53
    {
54
        signal(sig, SIG_DFL);
55
        raise(sig);
56
    }
57
58
    canjump = 0;
59
    siglongjmp(jmpbuf, 1);
60
}
61
62
#endif // if X265_ARCH_ARM
63
64
namespace X265_NS {
65
#if X265_ARCH_X86
66
static bool enable512 = false;
67
#endif
68
/* Table pairing a capability name with the capability flag mask it stands
 * for on the architecture being compiled.  On x86 most names expand to the
 * named feature plus the earlier features listed in the helper macros below.
 * The table is terminated by an entry with an empty name and a zero mask. */
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
/* Helper masks, local to this table (undefined again below).  The
 * replacement lists are parenthesized so they stay a single operand
 * wherever they are expanded. */
#define MMX2 (X265_CPU_MMX | X265_CPU_MMX2)
    { "MMX2",            MMX2 },
    { "MMXEXT",          MMX2 },
    { "SSE",             MMX2 | X265_CPU_SSE },
#define SSE2 (MMX2 | X265_CPU_SSE | X265_CPU_SSE2)
    { "SSE2Slow",        SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",            SSE2 },
    { "SSE2Fast",        SSE2 | X265_CPU_SSE2_IS_FAST },
    { "LZCNT",           X265_CPU_LZCNT },
    { "SSE3",            SSE2 | X265_CPU_SSE3 },
    { "SSSE3",           SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
    { "SSE4.1",          SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4",            SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4.2",          SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
#define AVX (SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX)
    { "AVX",             AVX },
    { "XOP",             AVX | X265_CPU_XOP },
    { "FMA4",            AVX | X265_CPU_FMA4 },
    { "FMA3",            AVX | X265_CPU_FMA3 },
    { "BMI1",            AVX | X265_CPU_LZCNT | X265_CPU_BMI1 },
    { "BMI2",            AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
#define AVX2 (AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2)
    { "AVX2",            AVX2 },
    { "AVX512",          AVX2 | X265_CPU_AVX512 },
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
    { "Cache32",         X265_CPU_CACHELINE_32 },
    { "Cache64",         X265_CPU_CACHELINE_64 },
    { "SlowAtom",        X265_CPU_SLOW_ATOM },
    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack",  X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",           X265_CPU_ARMV6 },
    { "NEON",            X265_CPU_NEON },
    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },

#elif X265_ARCH_ARM64
    { "NEON",            X265_CPU_NEON },
#if defined(HAVE_SVE)
    { "SVE",             X265_CPU_SVE },
#endif
#if defined(HAVE_SVE2)
    { "SVE2",            X265_CPU_SVE2 },
#endif
#if defined(HAVE_NEON_DOTPROD)
    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
#endif
#if defined(HAVE_NEON_I8MM)
    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
#endif
#if defined(HAVE_SVE2_BITPERM)
    { "SVE2_BitPerm",    X265_CPU_SVE2_BITPERM },
#endif
#elif X265_ARCH_POWER8
    { "Altivec",         X265_CPU_ALTIVEC },

#endif // if X265_ARCH_X86
    { "", 0 }, /* sentinel: empty name, no flags */
};
135
136
#if X265_ARCH_X86
137
138
extern "C" {
139
/* cpu-a.asm */
140
int PFX(cpu_cpuid_test)(void);
141
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
142
uint64_t PFX(cpu_xgetbv)(int xcr);
143
}
144
145
#if defined(_MSC_VER)
146
#pragma warning(disable: 4309) // truncation of constant value
147
#endif
148
149
bool detect512()
150
0
{
151
0
    return(enable512);
152
0
}
153
154
uint32_t cpu_detect(bool benableavx512 )
155
0
{
156
157
0
    uint32_t cpu = 0; 
158
0
    uint32_t eax, ebx, ecx, edx;
159
0
    uint32_t vendor[4] = { 0 };
160
0
    uint32_t max_extended_cap, max_basic_cap;
161
0
    uint64_t xcr0 = 0;
162
163
#if !X86_64
164
    if (!PFX(cpu_cpuid_test)())
165
        return 0;
166
#endif
167
168
0
    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
169
0
    if (max_basic_cap == 0)
170
0
        return 0;
171
172
0
    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
173
0
    if (edx & 0x00800000)
174
0
        cpu |= X265_CPU_MMX;
175
0
    else
176
0
        return cpu;
177
0
    if (edx & 0x02000000)
178
0
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
179
0
    if (edx & 0x04000000)
180
0
        cpu |= X265_CPU_SSE2;
181
0
    if (ecx & 0x00000001)
182
0
        cpu |= X265_CPU_SSE3;
183
0
    if (ecx & 0x00000200)
184
0
        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
185
0
    if (ecx & 0x00080000)
186
0
        cpu |= X265_CPU_SSE4;
187
0
    if (ecx & 0x00100000)
188
0
        cpu |= X265_CPU_SSE42;
189
190
0
    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
191
0
    {
192
        /* Check for OS support */
193
0
        xcr0 = PFX(cpu_xgetbv)(0);
194
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
195
0
        {
196
0
            if (ecx & 0x10000000)
197
0
            cpu |= X265_CPU_AVX;
198
0
            if (ecx & 0x00001000)
199
0
                cpu |= X265_CPU_FMA3;
200
0
        }
201
0
    }
202
203
0
    if (max_basic_cap >= 7)
204
0
    {
205
0
        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
206
        /* AVX2 requires OS support, but BMI1/2 don't. */
207
0
        if (ebx & 0x00000008)
208
0
            cpu |= X265_CPU_BMI1;
209
0
        if (ebx & 0x00000100)
210
0
            cpu |= X265_CPU_BMI2;
211
212
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
213
0
        {
214
0
            if (ebx & 0x00000020)
215
0
                cpu |= X265_CPU_AVX2;
216
0
            if (benableavx512)
217
0
            {
218
0
                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
219
0
                {
220
0
                    if ((ebx & 0xD0030000) == 0xD0030000)
221
0
                    {
222
0
                        cpu |= X265_CPU_AVX512;
223
0
                        enable512 = true;
224
0
                    }
225
0
                }
226
0
            }
227
0
        }
228
0
    }
229
230
0
    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
231
0
    max_extended_cap = eax;
232
233
0
    if (max_extended_cap >= 0x80000001)
234
0
    {
235
0
        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
236
237
0
        if (ecx & 0x00000020)
238
0
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
239
0
        if (ecx & 0x00000040) /* SSE4a, AMD only */
240
0
        {
241
0
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
242
0
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
243
0
            if (family == 0x14)
244
0
            {
245
0
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
246
0
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
247
0
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
248
0
            }
249
0
            if (family == 0x16)
250
0
            {
251
0
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
252
                                                * compared to alternate instruction sequences that this
253
                                                * is equal or faster on almost all such functions. */
254
0
            }
255
0
        }
256
257
0
        if (cpu & X265_CPU_AVX)
258
0
        {
259
0
            if (ecx & 0x00000800) /* XOP */
260
0
                cpu |= X265_CPU_XOP;
261
0
            if (ecx & 0x00010000) /* FMA4 */
262
0
                cpu |= X265_CPU_FMA4;
263
0
        }
264
265
0
        if (!strcmp((char*)vendor, "AuthenticAMD"))
266
0
        {
267
0
            if (edx & 0x00400000)
268
0
                cpu |= X265_CPU_MMX2;
269
0
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
270
0
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
271
0
        }
272
0
    }
273
274
0
    if (!strcmp((char*)vendor, "GenuineIntel"))
275
0
    {
276
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
277
0
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
278
0
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
279
0
        if (family == 6)
280
0
        {
281
            /* Detect Atom CPU */
282
0
            if (model == 28)
283
0
            {
284
0
                cpu |= X265_CPU_SLOW_ATOM;
285
0
                cpu |= X265_CPU_SLOW_PSHUFB;
286
0
            }
287
288
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
289
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
290
0
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
291
0
                cpu |= X265_CPU_SLOW_SHUFFLE;
292
0
        }
293
0
    }
294
295
0
    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
296
0
    {
297
        /* cacheline size is specified in 3 places, any of which may be missing */
298
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
299
0
        int cache = (ebx & 0xff00) >> 5; // cflush size
300
0
        if (!cache && max_extended_cap >= 0x80000006)
301
0
        {
302
0
            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
303
0
            cache = ecx & 0xff; // cacheline size
304
0
        }
305
0
        if (!cache && max_basic_cap >= 2)
306
0
        {
307
            // Cache and TLB Information
308
0
            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
309
0
            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
310
0
                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
311
0
            uint32_t buf[4];
312
0
            int max, i = 0;
313
0
            do
314
0
            {
315
0
                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
316
0
                max = buf[0] & 0xff;
317
0
                buf[0] &= ~0xff;
318
0
                for (int j = 0; j < 4; j++)
319
0
                {
320
0
                    if (!(buf[j] >> 31))
321
0
                        while (buf[j])
322
0
                        {
323
0
                            if (strchr(cache32_ids, buf[j] & 0xff))
324
0
                                cache = 32;
325
0
                            if (strchr(cache64_ids, buf[j] & 0xff))
326
0
                                cache = 64;
327
0
                            buf[j] >>= 8;
328
0
                        }
329
0
                }
330
0
            }
331
0
            while (++i < max);
332
0
        }
333
334
0
        if (cache == 32)
335
0
            cpu |= X265_CPU_CACHELINE_32;
336
0
        else if (cache == 64)
337
0
            cpu |= X265_CPU_CACHELINE_64;
338
0
        else
339
0
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
340
0
    }
341
342
#if BROKEN_STACK_ALIGNMENT
343
    cpu |= X265_CPU_STACK_MOD4;
344
#endif
345
346
0
    return cpu;
347
0
}
348
349
#elif X265_ARCH_ARM
350
351
extern "C" {
352
void PFX(cpu_neon_test)(void);
353
int PFX(cpu_fast_neon_mrc_test)(void);
354
}
355
356
/* Detects the capabilities of the 32-bit ARM CPU the process is running on.
 *
 * benableavx512: ignored here; only the x86 implementation reads it.
 *
 * Returns a bitmask of X265_CPU_* flags (0 unless built with HAVE_ARMV6 and
 * ENABLE_ASSEMBLY). */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    uint32_t flags = 0;

#if HAVE_ARMV6 && ENABLE_ASSEMBLY
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    // Execute the NEON probe with a SIGILL handler installed: if the probe
    // faults, the handler jumps back here and NEON is not reported.
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    PFX(cpu_neon_test)();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x265 instance disables or reinits the counters while x265 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}
396
397
#elif X265_ARCH_ARM64
398
#include "aarch64/cpu.h"
399
400
/* AArch64 capability detection.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns the value of aarch64_cpu_detect() when built with ENABLE_ASSEMBLY,
 * otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;

#ifdef ENABLE_ASSEMBLY
    int caps = aarch64_cpu_detect();
#else
    int caps = 0;
#endif

    return caps;
}
411
412
#elif X265_ARCH_POWER8
413
414
/* POWER8 capability detection, decided entirely at compile time.
 *
 * benableavx512: ignored on this architecture.
 *
 * Returns X265_CPU_ALTIVEC when built with HAVE_ALTIVEC, otherwise 0. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
#if HAVE_ALTIVEC
    return X265_CPU_ALTIVEC;
#else
    return 0;
#endif
}
422
423
#else // if X265_ARCH_POWER8
424
425
/* Fallback for architectures without a dedicated detection routine.
 *
 * benableavx512: ignored.
 *
 * Always returns 0 (no capability flags). */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512;
    return 0;
}
429
430
#endif // if X265_ARCH_X86
431
}
432