Coverage Report

Created: 2026-01-20 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/x265/source/common/cpu.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Loren Merritt <lorenm@u.washington.edu>
5
 *          Laurent Aimar <fenrir@via.ecp.fr>
6
 *          Fiona Glaser <fiona@x264.com>
7
 *          Steve Borho <steve@borho.org>
8
 *          Hongbin Liu <liuhongbin1@huawei.com>
9
 *          Yimeng Su <yimeng.su@huawei.com>
10
 *          Josh Dekker <josh@itanimul.li>
11
 *          Jean-Baptiste Kempf <jb@videolan.org>
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
 *
27
 * This program is also available under a commercial proprietary license.
28
 * For more information, contact us at license @ x265.com.
29
 *****************************************************************************/
30
31
#include "cpu.h"
32
#include "common.h"
33
34
#if MACOS || SYS_FREEBSD
35
#include <sys/types.h>
36
#include <sys/sysctl.h>
37
#endif
38
#if SYS_OPENBSD
39
#include <sys/param.h>
40
#include <sys/sysctl.h>
41
#include <machine/cpu.h>
42
#endif
43
44
#if X265_ARCH_ARM && !defined(HAVE_NEON)
45
#include <signal.h>
46
#include <setjmp.h>
47
/* Jump target and guard flag used while probing instruction support: the
 * prober arms `canjump`, runs a possibly-illegal instruction, and the SIGILL
 * handler unwinds back to the sigsetjmp point. */
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler for the NEON probe. If no probe is in flight (canjump not
 * armed), the signal is unexpected: restore the default disposition and
 * re-raise so the process dies normally. Otherwise disarm the guard and jump
 * back to the probe's sigsetjmp point. */
static void sigill_handler(int sig)
{
    if (canjump == 0)
    {
        /* SIGILL arrived outside a probe window — not ours to swallow. */
        signal(sig, SIG_DFL);
        raise(sig);
    }
    canjump = 0;
    siglongjmp(jmpbuf, 1);
}
61
62
#endif // if X265_ARCH_ARM
63
64
namespace X265_NS {
65
#if X265_ARCH_X86
66
static bool enable512 = false;
67
#endif
68
/* Table mapping human-readable capability names to X265_CPU_* flag masks for
 * the architecture this file is compiled for. Cumulative ISA levels (e.g.
 * "SSE4.2" implies SSE..SSE4) are built up via the temporary macros below.
 * The list is terminated by a sentinel entry with an empty name and mask 0. */
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
/* Each macro accumulates the baseline implied by the previous ISA level. */
#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
    { "MMX2",        MMX2 },
    { "MMXEXT",      MMX2 },
    { "SSE",         MMX2 | X265_CPU_SSE },
#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
    { "SSE2Slow",    SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",        SSE2 },
    { "SSE2Fast",    SSE2 | X265_CPU_SSE2_IS_FAST },
    { "LZCNT", X265_CPU_LZCNT },
    { "SSE3",        SSE2 | X265_CPU_SSE3 },
    { "SSSE3",       SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
    { "SSE4.1",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4",        SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4.2",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
    { "AVX",         AVX },
    { "XOP",         AVX | X265_CPU_XOP },
    { "FMA4",        AVX | X265_CPU_FMA4 },
    { "FMA3",        AVX | X265_CPU_FMA3 },
    { "BMI1",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 },
    { "BMI2",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
    { "AVX2", AVX2},
    { "AVX512", AVX2 | X265_CPU_AVX512 },
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
    /* Non-ISA attribute flags (cacheline size, per-uarch slowness hints). */
    { "Cache32",         X265_CPU_CACHELINE_32 },
    { "Cache64",         X265_CPU_CACHELINE_64 },
    { "SlowAtom",        X265_CPU_SLOW_ATOM },
    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack",  X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",           X265_CPU_ARMV6 },
    { "NEON",            X265_CPU_NEON },
    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },

#elif X265_ARCH_ARM64
    { "NEON",            X265_CPU_NEON },
#if defined(HAVE_SVE)
    { "SVE",            X265_CPU_SVE },
#endif
#if defined(HAVE_SVE2)
    { "SVE2",            X265_CPU_SVE2 },
#endif
#if defined(HAVE_NEON_DOTPROD)
    { "Neon_DotProd",    X265_CPU_NEON_DOTPROD },
#endif
#if defined(HAVE_NEON_I8MM)
    { "Neon_I8MM",       X265_CPU_NEON_I8MM },
#endif
#if defined(HAVE_SVE2_BITPERM)
    { "SVE2_BitPerm",    X265_CPU_SVE2_BITPERM },
#endif
#elif X265_ARCH_POWER8
    { "Altivec",         X265_CPU_ALTIVEC },

#elif X265_ARCH_RISCV64
    { "RVV",           X265_CPU_RVV },
    { "Zbb",           X265_CPU_ZBB },

#endif // if X265_ARCH_X86
    { "", 0 },  /* sentinel: end of table */
};
139
140
#if X265_ARCH_X86
141
142
extern "C" {
/* cpu-a.asm — x86 assembly helpers */
/* Returns non-zero when the CPUID instruction is usable (only probed on 32-bit builds). */
int PFX(cpu_cpuid_test)(void);
/* Executes CPUID for leaf `op`, writing all four result registers. */
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
/* Executes XGETBV for the given extended control register index. */
uint64_t PFX(cpu_xgetbv)(int xcr);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4309) // truncation of constant value
#endif
152
153
bool detect512()
154
0
{
155
0
    return(enable512);
156
0
}
157
158
uint32_t cpu_detect(bool benableavx512 )
159
0
{
160
161
0
    uint32_t cpu = 0; 
162
0
    uint32_t eax, ebx, ecx, edx;
163
0
    uint32_t vendor[4] = { 0 };
164
0
    uint32_t max_extended_cap, max_basic_cap;
165
0
    uint64_t xcr0 = 0;
166
167
#if !X86_64
168
    if (!PFX(cpu_cpuid_test)())
169
        return 0;
170
#endif
171
172
0
    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
173
0
    if (max_basic_cap == 0)
174
0
        return 0;
175
176
0
    PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
177
0
    if (edx & 0x00800000)
178
0
        cpu |= X265_CPU_MMX;
179
0
    else
180
0
        return cpu;
181
0
    if (edx & 0x02000000)
182
0
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
183
0
    if (edx & 0x04000000)
184
0
        cpu |= X265_CPU_SSE2;
185
0
    if (ecx & 0x00000001)
186
0
        cpu |= X265_CPU_SSE3;
187
0
    if (ecx & 0x00000200)
188
0
        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
189
0
    if (ecx & 0x00080000)
190
0
        cpu |= X265_CPU_SSE4;
191
0
    if (ecx & 0x00100000)
192
0
        cpu |= X265_CPU_SSE42;
193
194
0
    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
195
0
    {
196
        /* Check for OS support */
197
0
        xcr0 = PFX(cpu_xgetbv)(0);
198
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
199
0
        {
200
0
            if (ecx & 0x10000000)
201
0
            cpu |= X265_CPU_AVX;
202
0
            if (ecx & 0x00001000)
203
0
                cpu |= X265_CPU_FMA3;
204
0
        }
205
0
    }
206
207
0
    if (max_basic_cap >= 7)
208
0
    {
209
0
        PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
210
        /* AVX2 requires OS support, but BMI1/2 don't. */
211
0
        if (ebx & 0x00000008)
212
0
            cpu |= X265_CPU_BMI1;
213
0
        if (ebx & 0x00000100)
214
0
            cpu |= X265_CPU_BMI2;
215
216
0
        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
217
0
        {
218
0
            if (ebx & 0x00000020)
219
0
                cpu |= X265_CPU_AVX2;
220
0
            if (benableavx512)
221
0
            {
222
0
                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
223
0
                {
224
0
                    if ((ebx & 0xD0030000) == 0xD0030000)
225
0
                    {
226
0
                        cpu |= X265_CPU_AVX512;
227
0
                        enable512 = true;
228
0
                    }
229
0
                }
230
0
            }
231
0
        }
232
0
    }
233
234
0
    PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
235
0
    max_extended_cap = eax;
236
237
0
    if (max_extended_cap >= 0x80000001)
238
0
    {
239
0
        PFX(cpu_cpuid)(0x80000001, &eax, &ebx, &ecx, &edx);
240
241
0
        if (ecx & 0x00000020)
242
0
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
243
0
        if (ecx & 0x00000040) /* SSE4a, AMD only */
244
0
        {
245
0
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
246
0
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
247
0
            if (family == 0x14)
248
0
            {
249
0
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
250
0
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
251
0
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
252
0
            }
253
0
            if (family == 0x16)
254
0
            {
255
0
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
256
                                                * compared to alternate instruction sequences that this
257
                                                * is equal or faster on almost all such functions. */
258
0
            }
259
0
        }
260
261
0
        if (cpu & X265_CPU_AVX)
262
0
        {
263
0
            if (ecx & 0x00000800) /* XOP */
264
0
                cpu |= X265_CPU_XOP;
265
0
            if (ecx & 0x00010000) /* FMA4 */
266
0
                cpu |= X265_CPU_FMA4;
267
0
        }
268
269
0
        if (!strcmp((char*)vendor, "AuthenticAMD"))
270
0
        {
271
0
            if (edx & 0x00400000)
272
0
                cpu |= X265_CPU_MMX2;
273
0
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
274
0
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
275
0
        }
276
0
    }
277
278
0
    if (!strcmp((char*)vendor, "GenuineIntel"))
279
0
    {
280
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
281
0
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
282
0
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
283
0
        if (family == 6)
284
0
        {
285
            /* Detect Atom CPU */
286
0
            if (model == 28)
287
0
            {
288
0
                cpu |= X265_CPU_SLOW_ATOM;
289
0
                cpu |= X265_CPU_SLOW_PSHUFB;
290
0
            }
291
292
            /* Conroe has a slow shuffle unit. Check the model number to make sure not
293
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
294
0
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
295
0
                cpu |= X265_CPU_SLOW_SHUFFLE;
296
0
        }
297
0
    }
298
299
0
    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
300
0
    {
301
        /* cacheline size is specified in 3 places, any of which may be missing */
302
0
        PFX(cpu_cpuid)(1, &eax, &ebx, &ecx, &edx);
303
0
        int cache = (ebx & 0xff00) >> 5; // cflush size
304
0
        if (!cache && max_extended_cap >= 0x80000006)
305
0
        {
306
0
            PFX(cpu_cpuid)(0x80000006, &eax, &ebx, &ecx, &edx);
307
0
            cache = ecx & 0xff; // cacheline size
308
0
        }
309
0
        if (!cache && max_basic_cap >= 2)
310
0
        {
311
            // Cache and TLB Information
312
0
            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
313
0
            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
314
0
                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
315
0
            uint32_t buf[4];
316
0
            int max, i = 0;
317
0
            do
318
0
            {
319
0
                PFX(cpu_cpuid)(2, buf + 0, buf + 1, buf + 2, buf + 3);
320
0
                max = buf[0] & 0xff;
321
0
                buf[0] &= ~0xff;
322
0
                for (int j = 0; j < 4; j++)
323
0
                {
324
0
                    if (!(buf[j] >> 31))
325
0
                        while (buf[j])
326
0
                        {
327
0
                            if (strchr(cache32_ids, buf[j] & 0xff))
328
0
                                cache = 32;
329
0
                            if (strchr(cache64_ids, buf[j] & 0xff))
330
0
                                cache = 64;
331
0
                            buf[j] >>= 8;
332
0
                        }
333
0
                }
334
0
            }
335
0
            while (++i < max);
336
0
        }
337
338
0
        if (cache == 32)
339
0
            cpu |= X265_CPU_CACHELINE_32;
340
0
        else if (cache == 64)
341
0
            cpu |= X265_CPU_CACHELINE_64;
342
0
        else
343
0
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
344
0
    }
345
346
#if BROKEN_STACK_ALIGNMENT
347
    cpu |= X265_CPU_STACK_MOD4;
348
#endif
349
350
0
    return cpu;
351
0
}
352
353
#elif X265_ARCH_ARM
354
355
extern "C" {
/* cpu-a.asm — ARM assembly helpers */
/* Probe that executes a NEON instruction (raises SIGILL when NEON is absent). */
void PFX(cpu_neon_test)(void);
/* Timing-based test distinguishing cores with a fast NEON->ARM MRC path
 * (e.g. Cortex-A9) — see the comment block below. */
int PFX(cpu_fast_neon_mrc_test)(void);
}

/* ARM (32-bit) capability detection.
 *
 * When not built with -mfpu=neon, NEON support is probed at runtime by
 * executing a NEON instruction under a temporary SIGILL handler
 * (sigill_handler): if the instruction faults, control returns here via
 * sigsetjmp and only the flags gathered so far are reported.
 *
 * @param benableavx512  unused on ARM (x86-only toggle).
 * @return bitmask of X265_CPU_* flags detected.
 */
uint32_t cpu_detect(bool benableavx512)
{
    int flags = 0;

#if HAVE_ARMV6 && ENABLE_ASSEMBLY
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        /* NEON probe faulted: restore the old handler, report what we have. */
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    PFX(cpu_neon_test)();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}
400
401
#elif X265_ARCH_ARM64
402
#include "aarch64/cpu.h"
403
404
/* AArch64: delegate capability detection to the aarch64 helper when the
 * assembly backend is built; otherwise report no capabilities. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#ifdef ENABLE_ASSEMBLY
    return aarch64_cpu_detect();
#else
    return 0;
#endif
}
415
416
#elif X265_ARCH_RISCV64
417
#include "riscv64/cpu.h"
418
419
/* RISC-V 64: delegate capability detection to the riscv64 helper when the
 * assembly backend is built; otherwise report no capabilities. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#ifdef ENABLE_ASSEMBLY
    return riscv64_cpu_detect();
#else
    return 0;
#endif
}
430
431
#elif X265_ARCH_POWER8
432
433
/* POWER8: AltiVec support is a compile-time decision, so detection reduces
 * to reporting the build configuration. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* AVX-512 opt-in is an x86-only concern */
#if HAVE_ALTIVEC
    return X265_CPU_ALTIVEC;
#else
    return 0;
#endif
}
441
442
#else // if X265_ARCH_POWER8
443
444
/* Generic fallback for architectures with no capability detection: report
 * no optional CPU features. */
uint32_t cpu_detect(bool benableavx512)
{
    (void)benableavx512; /* unused on this architecture */
    return 0;
}
448
449
#endif // if X265_ARCH_X86
450
}
451