Coverage Report

Created: 2022-08-24 06:15

/src/x265/source/common/cpu.cpp
 Line|  Count|Source
    1|       |/*****************************************************************************
    2|       | * Copyright (C) 2013-2020 MulticoreWare, Inc
    3|       | *
    4|       | * Authors: Loren Merritt <lorenm@u.washington.edu>
    5|       | *          Laurent Aimar <fenrir@via.ecp.fr>
    6|       | *          Fiona Glaser <fiona@x264.com>
    7|       | *          Steve Borho <steve@borho.org>
    8|       | *          Hongbin Liu <liuhongbin1@huawei.com>
    9|       | *          Yimeng Su <yimeng.su@huawei.com>
   10|       | *
   11|       | * This program is free software; you can redistribute it and/or modify
   12|       | * it under the terms of the GNU General Public License as published by
   13|       | * the Free Software Foundation; either version 2 of the License, or
   14|       | * (at your option) any later version.
   15|       | *
   16|       | * This program is distributed in the hope that it will be useful,
   17|       | * but WITHOUT ANY WARRANTY; without even the implied warranty of
   18|       | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19|       | * GNU General Public License for more details.
   20|       | *
   21|       | * You should have received a copy of the GNU General Public License
   22|       | * along with this program; if not, write to the Free Software
   23|       | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
   24|       | *
   25|       | * This program is also available under a commercial proprietary license.
   26|       | * For more information, contact us at license @ x265.com.
   27|       | *****************************************************************************/
   28|       |
   29|       |#include "cpu.h"
   30|       |#include "common.h"
   31|       |
   32|       |#if MACOS || SYS_FREEBSD
   33|       |#include <sys/types.h>
   34|       |#include <sys/sysctl.h>
   35|       |#endif
   36|       |#if SYS_OPENBSD
   37|       |#include <sys/param.h>
   38|       |#include <sys/sysctl.h>
   39|       |#include <machine/cpu.h>
   40|       |#endif
   41|       |
   42|       |#if X265_ARCH_ARM && !defined(HAVE_NEON)
   43|       |#include <signal.h>
   44|       |#include <setjmp.h>
   45|       |static sigjmp_buf jmpbuf;
   46|       |static volatile sig_atomic_t canjump = 0;
   47|       |
   48|       |static void sigill_handler(int sig)
   49|       |{
   50|       |    if (!canjump)
   51|       |    {
   52|       |        signal(sig, SIG_DFL);
   53|       |        raise(sig);
   54|       |    }
   55|       |
   56|       |    canjump = 0;
   57|       |    siglongjmp(jmpbuf, 1);
   58|       |}
   59|       |
   60|       |#endif // if X265_ARCH_ARM
   61|       |
   62|       |namespace X265_NS {
   63|       |static bool enable512 = false;
   64|       |const cpu_name_t cpu_names[] =
   65|       |{
   66|       |#if X265_ARCH_X86
   67|       |#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
   68|       |    { "MMX2",        MMX2 },
   69|       |    { "MMXEXT",      MMX2 },
   70|       |    { "SSE",         MMX2 | X265_CPU_SSE },
   71|       |#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
   72|       |    { "SSE2Slow",    SSE2 | X265_CPU_SSE2_IS_SLOW },
   73|       |    { "SSE2",        SSE2 },
   74|       |    { "SSE2Fast",    SSE2 | X265_CPU_SSE2_IS_FAST },
   75|       |    { "LZCNT",       X265_CPU_LZCNT },
   76|       |    { "SSE3",        SSE2 | X265_CPU_SSE3 },
   77|       |    { "SSSE3",       SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
   78|       |    { "SSE4.1",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
   79|       |    { "SSE4",        SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
   80|       |    { "SSE4.2",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
   81|       |#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
   82|       |    { "AVX",         AVX },
   83|       |    { "XOP",         AVX | X265_CPU_XOP },
   84|       |    { "FMA4",        AVX | X265_CPU_FMA4 },
   85|       |    { "FMA3",        AVX | X265_CPU_FMA3 },
   86|       |    { "BMI1",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 },
   87|       |    { "BMI2",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
   88|       |#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
   89|       |    { "AVX2",        AVX2 },
   90|       |    { "AVX512",      AVX2 | X265_CPU_AVX512 },
   91|       |#undef AVX2
   92|       |#undef AVX
   93|       |#undef SSE2
   94|       |#undef MMX2
   95|       |    { "Cache32",         X265_CPU_CACHELINE_32 },
   96|       |    { "Cache64",         X265_CPU_CACHELINE_64 },
   97|       |    { "SlowAtom",        X265_CPU_SLOW_ATOM },
   98|       |    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
   99|       |    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
  100|       |    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
  101|       |    { "UnalignedStack",  X265_CPU_STACK_MOD4 },
  102|       |
  103|       |#elif X265_ARCH_ARM
  104|       |    { "ARMv6",           X265_CPU_ARMV6 },
  105|       |    { "NEON",            X265_CPU_NEON },
  106|       |    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
  107|       |
  108|       |#elif X265_ARCH_POWER8
  109|       |    { "Altivec",         X265_CPU_ALTIVEC },
  110|       |
  111|       |#endif // if X265_ARCH_X86
  112|       |    { "", 0 },
  113|       |};
  114|       |
  115|       |#if X265_ARCH_X86
  116|       |
  117|       |extern "C" {
  118|       |/* cpu-a.asm */
  119|       |int PFX(cpu_cpuid_test)(void);
  120|       |void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
  121|       |uint64_t PFX(cpu_xgetbv)(int xcr);
  122|       |}
  123|       |
  124|       |#if defined(_MSC_VER)
  125|       |#pragma warning(disable: 4309) // truncation of constant value
  126|       |#endif
  127|       |
  128|       |bool detect512()
  129|   276k|{
  130|   276k|    return(enable512);
  131|   276k|}
  132|       |
  133|       |uint32_t cpu_detect(bool benableavx512)
  134|  2.79k|{
  135|       |
  136|  2.79k|    uint32_t cpu = 0;
  137|  2.79k|    uint32_t eax, ebx, ecx, edx;
  138|  2.79k|    uint32_t vendor[4] = { 0 };
  139|  2.79k|    uint32_t max_extended_cap, max_basic_cap;
  140|  2.79k|    uint64_t xcr0 = 0;
  141|       |
  142|       |#if !X86_64
  143|       |    if (!PFX(cpu_cpuid_test)())
  144|       |        return 0;
  145|       |#endif
  146|       |
  147|  2.79k|    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
  148|  2.79k|    if (max_basic_cap == 0)
  149|  2.79k|        return 0;
  150|       |
  151|      0|    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
  152|      0|    if (edx & 0x00800000)
  153|      0|        cpu |= X265_CPU_MMX;
  154|      0|    else
  155|      0|        return cpu;
  156|      0|    if (edx & 0x02000000)
  157|      0|        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
  158|      0|    if (edx & 0x04000000)
  159|      0|        cpu |= X265_CPU_SSE2;
  160|      0|    if (ecx & 0x00000001)
  161|      0|        cpu |= X265_CPU_SSE3;
  162|      0|    if (ecx & 0x00000200)
  163|      0|        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
  164|      0|    if (ecx & 0x00080000)
  165|      0|        cpu |= X265_CPU_SSE4;
  166|      0|    if (ecx & 0x00100000)
  167|      0|        cpu |= X265_CPU_SSE42;
  168|       |
  169|      0|    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
  170|      0|    {
  171|       |        /* Check for OS support */
  172|      0|        xcr0 = PFX(cpu_xgetbv)(0);
  173|      0|        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
  174|      0|        {
  175|      0|            if (ecx & 0x10000000)
  176|      0|                cpu |= X265_CPU_AVX;
  177|      0|            if (ecx & 0x00001000)
  178|      0|                cpu |= X265_CPU_FMA3;
  179|      0|        }
  180|      0|    }
  181|       |
  182|      0|    if (max_basic_cap >= 7)
  183|      0|    {
  184|      0|        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
  185|       |        /* AVX2 requires OS support, but BMI1/2 don't. */
  186|      0|        if (ebx & 0x00000008)
  187|      0|            cpu |= X265_CPU_BMI1;
  188|      0|        if (ebx & 0x00000100)
  189|      0|            cpu |= X265_CPU_BMI2;
  190|       |
  191|      0|        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
  192|      0|        {
  193|      0|            if (ebx & 0x00000020)
  194|      0|                cpu |= X265_CPU_AVX2;
  195|      0|            if (benableavx512)
  196|      0|            {
  197|      0|                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
  198|      0|                {
  199|      0|                    if ((ebx & 0xD0030000) == 0xD0030000)
  200|      0|                    {
  201|      0|                        cpu |= X265_CPU_AVX512;
  202|      0|                        enable512 = true;
  203|      0|                    }
  204|      0|                }
  205|      0|            }
  206|      0|        }
  207|      0|    }
  208|       |
  209|      0|    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
  210|      0|    max_extended_cap = eax;
  211|       |
  212|      0|    if (max_extended_cap >= 0x80000001)
  213|      0|    {
  214|      0|        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
  215|       |
  216|      0|        if (ecx & 0x00000020)
  217|      0|            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
  218|      0|        if (ecx & 0x00000040) /* SSE4a, AMD only */
  219|      0|        {
  220|      0|            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
  221|      0|            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
  222|      0|            if (family == 0x14)
  223|      0|            {
  224|      0|                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
  225|      0|                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
  226|      0|                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
  227|      0|            }
  228|      0|            if (family == 0x16)
  229|      0|            {
  230|      0|                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
  231|       |                                                * compared to alternate instruction sequences that this
  232|       |                                                * is equal or faster on almost all such functions. */
  233|      0|            }
  234|      0|        }
  235|       |
  236|      0|        if (cpu & X265_CPU_AVX)
  237|      0|        {
  238|      0|            if (ecx & 0x00000800) /* XOP */
  239|      0|                cpu |= X265_CPU_XOP;
  240|      0|            if (ecx & 0x00010000) /* FMA4 */
  241|      0|                cpu |= X265_CPU_FMA4;
  242|      0|        }
  243|       |
  244|      0|        if (!strcmp((char*)vendor, "AuthenticAMD"))
  245|      0|        {
  246|      0|            if (edx & 0x00400000)
  247|      0|                cpu |= X265_CPU_MMX2;
  248|      0|            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
  249|      0|                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
  250|      0|        }
  251|      0|    }
  252|       |
  253|      0|    if (!strcmp((char*)vendor, "GenuineIntel"))
  254|      0|    {
  255|      0|        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
  256|      0|        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
  257|      0|        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
  258|      0|        if (family == 6)
  259|      0|        {
  260|       |            /* Detect Atom CPU */
  261|      0|            if (model == 28)
  262|      0|            {
  263|      0|                cpu |= X265_CPU_SLOW_ATOM;
  264|      0|                cpu |= X265_CPU_SLOW_PSHUFB;
  265|      0|            }
  266|       |
  267|       |            /* Conroe has a slow shuffle unit. Check the model number to make sure not
  268|       |             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
  269|      0|            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
  270|      0|                cpu |= X265_CPU_SLOW_SHUFFLE;
  271|      0|        }
  272|      0|    }
  273|       |
  274|      0|    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
  275|      0|    {
  276|       |        /* cacheline size is specified in 3 places, any of which may be missing */
  277|      0|        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
  278|      0|        int cache = (ebx & 0xff00) >> 5; // clflush size
  279|      0|        if (!cache && max_extended_cap >= 0x80000006)
  280|      0|        {
  281|      0|            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
  282|      0|            cache = ecx & 0xff; // cacheline size
  283|      0|        }
  284|      0|        if (!cache && max_basic_cap >= 2)
  285|      0|        {
  286|       |            // Cache and TLB Information
  287|      0|            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
  288|      0|            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
  289|      0|                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
  290|      0|            uint32_t buf[4];
  291|      0|            int max, i = 0;
  292|      0|            do
  293|      0|            {
  294|      0|                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
  295|      0|                max = buf[0] & 0xff;
  296|      0|                buf[0] &= ~0xff;
  297|      0|                for (int j = 0; j < 4; j++)
  298|      0|                {
  299|      0|                    if (!(buf[j] >> 31))
  300|      0|                        while (buf[j])
  301|      0|                        {
  302|      0|                            if (strchr(cache32_ids, buf[j] & 0xff))
  303|      0|                                cache = 32;
  304|      0|                            if (strchr(cache64_ids, buf[j] & 0xff))
  305|      0|                                cache = 64;
  306|      0|                            buf[j] >>= 8;
  307|      0|                        }
  308|      0|                }
  309|      0|            }
  310|      0|            while (++i < max);
  311|      0|        }
  312|       |
  313|      0|        if (cache == 32)
  314|      0|            cpu |= X265_CPU_CACHELINE_32;
  315|      0|        else if (cache == 64)
  316|      0|            cpu |= X265_CPU_CACHELINE_64;
  317|      0|        else
  318|      0|            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
  319|      0|    }
  320|       |
  321|       |#if BROKEN_STACK_ALIGNMENT
  322|       |    cpu |= X265_CPU_STACK_MOD4;
  323|       |#endif
  324|       |
  325|      0|    return cpu;
  326|      0|}
  327|       |
  328|       |#elif X265_ARCH_ARM
  329|       |
  330|       |extern "C" {
  331|       |void PFX(cpu_neon_test)(void);
  332|       |int PFX(cpu_fast_neon_mrc_test)(void);
  333|       |}
  334|       |
  335|       |uint32_t cpu_detect(bool benableavx512)
  336|       |{
  337|       |    int flags = 0;
  338|       |
  339|       |#if HAVE_ARMV6
  340|       |    flags |= X265_CPU_ARMV6;
  341|       |
  342|       |    // don't do this hack if compiled with -mfpu=neon
  343|       |#if !HAVE_NEON
  344|       |    static void (* oldsig)(int);
  345|       |    oldsig = signal(SIGILL, sigill_handler);
  346|       |    if (sigsetjmp(jmpbuf, 1))
  347|       |    {
  348|       |        signal(SIGILL, oldsig);
  349|       |        return flags;
  350|       |    }
  351|       |
  352|       |    canjump = 1;
  353|       |    PFX(cpu_neon_test)();
  354|       |    canjump = 0;
  355|       |    signal(SIGILL, oldsig);
  356|       |#endif // if !HAVE_NEON
  357|       |
  358|       |    flags |= X265_CPU_NEON;
  359|       |
  360|       |    // fast neon -> arm (Cortex-A9) detection relies on user access to the
  361|       |    // cycle counter; this assumes ARMv7 performance counters.
  362|       |    // NEON requires at least ARMv7, ARMv8 may require changes here, but
  363|       |    // hopefully this hacky detection method will have been replaced by then.
  364|       |    // Note that there is potential for a race condition if another program or
  365|       |    // x264 instance disables or reinits the counters while x264 is using them,
  366|       |    // which may result in incorrect detection and the counters stuck enabled.
  367|       |    // right now Apple does not seem to support performance counters for this test
  368|       |#ifndef __MACH__
  369|       |    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
  370|       |#endif
  371|       |    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
  372|       |#elif X265_ARCH_ARM64
  373|       |    flags |= X265_CPU_NEON;
  374|       |#endif // if HAVE_ARMV6
  375|       |    return flags;
  376|       |}
  377|       |
  378|       |#elif X265_ARCH_POWER8
  379|       |
  380|       |uint32_t cpu_detect(bool benableavx512)
  381|       |{
  382|       |#if HAVE_ALTIVEC
  383|       |    return X265_CPU_ALTIVEC;
  384|       |#else
  385|       |    return 0;
  386|       |#endif
  387|       |}
  388|       |
  389|       |#else // if X265_ARCH_POWER8
  390|       |
  391|       |uint32_t cpu_detect(bool benableavx512)
  392|       |{
  393|       |    return 0;
  394|       |}
  395|       |
  396|       |#endif // if X265_ARCH_X86
  397|       |}
  398|       |
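
Note: in this run the x86 body of cpu_detect() shows zero counts after the early return at line 149, so the CPUID/XGETBV sequence above was never exercised. As a reading aid, below is a minimal, self-contained sketch of the same detection order. It is not x265 code: it uses the GCC/Clang <cpuid.h> built-ins and an inline-asm xgetbv instead of the PFX(cpu_cpuid)/PFX(cpu_xgetbv) helpers from cpu-a.asm, and the helper name read_xcr0 is invented for the example.

    // Sketch only (GCC/Clang, x86-64): mirrors the bit tests used by cpu_detect() above.
    #include <cpuid.h>
    #include <cstdint>
    #include <cstdio>

    static uint64_t read_xcr0()
    {
        uint32_t lo, hi;
        // Same operation as PFX(cpu_xgetbv)(0): read extended control register 0.
        __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
        return ((uint64_t)hi << 32) | lo;
    }

    int main()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;                                 // CPUID leaf 1 unavailable

        bool sse2 = edx & 0x04000000;                 // same bit tested at line 158
        bool osxsave = ecx & 0x08000000;              // XGETBV usable, as at line 169
        bool avx = false, avx2 = false;

        if (osxsave && (read_xcr0() & 0x6) == 0x6)    // OS saves XMM/YMM state
            avx = (ecx & 0x10000000) != 0;

        if (avx && __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            avx2 = (ebx & 0x00000020) != 0;           // leaf 7 EBX bit 5, as at line 193

        printf("SSE2:%d AVX:%d AVX2:%d\n", sse2, avx, avx2);
        return 0;
    }

The mask that the real cpu_detect() builds from these bits is what the cpu_names[] table at the top of the file maps to and from human-readable flag names (for example, "AVX2" implies the whole SSE2/SSE3/SSSE3/SSE4/SSE4.2/AVX chain plus FMA3, LZCNT, BMI1 and BMI2).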