/src/x265/source/common/primitives.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU General Public License as published by |
8 | | * the Free Software Foundation; either version 2 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU General Public License |
17 | | * along with this program; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
19 | | * |
20 | | * This program is also available under a commercial proprietary license. |
21 | | * For more information, contact us at license @ x265.com. |
22 | | *****************************************************************************/ |
23 | | |
24 | | #include "common.h" |
25 | | #include "primitives.h" |
26 | | |
27 | | namespace X265_NS { |
28 | | // x265 private namespace |
29 | | |
/* Lookup table mapping a luma block's (width, height) to its LUMA_NxM
 * partition enum.  Rows are indexed by width and columns by height, both in
 * steps of 4 pixels (4, 8, 12, ... 64); presumably callers compute the index
 * as ((width >> 2) - 1) * 16 + ((height >> 2) - 1) -- TODO confirm against
 * the partitionFromSizes() user.  Entries of 255 mark width/height
 * combinations that are not legal HEVC luma partitions. */
extern const uint8_t lumaPartitionMapTable[] =
{
//  4               8          12          16          20   24         28   32          36   40   44   48          52   56   60   64
    LUMA_4x4,  LUMA_4x8,  255,        LUMA_4x16,  255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 4
    LUMA_8x4,  LUMA_8x8,  255,        LUMA_8x16,  255, 255,       255, LUMA_8x32,  255, 255, 255, 255,        255, 255, 255, 255,        // 8
    255,       255,       255,        LUMA_12x16, 255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255,       255, LUMA_16x32, 255, 255, 255, 255,        255, 255, 255, LUMA_16x64, // 16
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 20
    255,       255,       255,        255,        255, 255,       255, LUMA_24x32, 255, 255, 255, 255,        255, 255, 255, 255,        // 24
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 28
    255,       LUMA_32x8, 255,        LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255,       255, 255, 255, LUMA_32x64, // 32
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 36
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 40
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 44
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, LUMA_48x64, // 48
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 52
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 56
    255,       255,       255,        255,        255, 255,       255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 60
    255,       255,       255,        LUMA_64x16, 255, 255,       255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64  // 64
};
50 | | |
/* the "authoritative" set of encoder primitives; filled in once by
 * x265_setup_primitives() and consulted by the whole encoder */
EncoderPrimitives primitives;

/* forward declarations of the per-module C reference initializers; each
 * populates its portion of an EncoderPrimitives table (the implementing
 * source file is noted where each is called in setupCPrimitives()) */
void setupPixelPrimitives_c(EncoderPrimitives &p);
void setupDCTPrimitives_c(EncoderPrimitives &p);
void setupFilterPrimitives_c(EncoderPrimitives &p);
void setupIntraPrimitives_c(EncoderPrimitives &p);
void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
void setupSaoPrimitives_c(EncoderPrimitives &p);
void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
void setupLowPassPrimitives_c(EncoderPrimitives& p);
62 | | |
/* Install the portable C reference implementation for every primitive.
 * These are installed first; optimized versions (assembly/intrinsics,
 * installed later in x265_setup_primitives) overwrite individual entries. */
void setupCPrimitives(EncoderPrimitives &p)
{
    setupPixelPrimitives_c(p);       // pixel.cpp
    setupDCTPrimitives_c(p);         // dct.cpp
    setupLowPassPrimitives_c(p);     // lowpassdct.cpp
    setupFilterPrimitives_c(p);      // ipfilter.cpp
    setupIntraPrimitives_c(p);       // intrapred.cpp
    setupLoopFilterPrimitives_c(p);  // loopfilter.cpp
    setupSaoPrimitives_c(p);         // sao.cpp
    setupSeaIntegralPrimitives_c(p); // framefilter.cpp
}
74 | | |
/* Swap the 16x16 and 32x32 forward-DCT entry points for their lowpass
 * approximations.  The currently-active dct pointers are saved into
 * standard_dct first (for all four CU sizes) so the full transform remains
 * reachable after the swap.  NOTE(review): must run after any asm setup,
 * since it snapshots whatever dct pointer is active at call time. */
void enableLowpassDCTPrimitives(EncoderPrimitives &p)
{
    // update copies of the standard dct transform (before overwriting below)
    p.cu[BLOCK_4x4].standard_dct = p.cu[BLOCK_4x4].dct;
    p.cu[BLOCK_8x8].standard_dct = p.cu[BLOCK_8x8].dct;
    p.cu[BLOCK_16x16].standard_dct = p.cu[BLOCK_16x16].dct;
    p.cu[BLOCK_32x32].standard_dct = p.cu[BLOCK_32x32].dct;

    // replace active dct by lowpass dct for high dct transforms
    p.cu[BLOCK_16x16].dct = p.cu[BLOCK_16x16].lowpass_dct;
    p.cu[BLOCK_32x32].dct = p.cu[BLOCK_32x32].lowpass_dct;
}
87 | | |
/* Fill primitive table slots that can reuse an existing function pointer
 * instead of a dedicated implementation (e.g. chroma 4:4:4 reuses luma,
 * many chroma satd slots reuse same-sized luma satd).  Order of the
 * assignments below is significant: later statements intentionally
 * overwrite earlier ones, so this must run after all other primitive
 * setup for the aliases to capture the final (optimized) pointers. */
void setupAliasPrimitives(EncoderPrimitives &p)
{
#if HIGH_BIT_DEPTH
    /* at HIGH_BIT_DEPTH, pixel == short so we can alias many primitives */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
#if !defined(X265_ARCH_ARM64)
        p.cu[i].sse_pp = (pixel_sse_t)p.cu[i].sse_ss;
#endif

        /* pixel and short copies are the same routine when pixel == short */
        p.cu[i].copy_ps = (copy_ps_t)p.pu[i].copy_pp;
        p.cu[i].copy_sp = (copy_sp_t)p.pu[i].copy_pp;
        p.cu[i].copy_ss = (copy_ss_t)p.pu[i].copy_pp;

        p.chroma[X265_CSP_I420].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;
        p.chroma[X265_CSP_I420].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I420].pu[i].copy_pp;

        p.chroma[X265_CSP_I422].cu[i].copy_ps = (copy_ps_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_sp = (copy_sp_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
        p.chroma[X265_CSP_I422].cu[i].copy_ss = (copy_ss_t)p.chroma[X265_CSP_I422].pu[i].copy_pp;
    }
#endif

    /* alias chroma 4:4:4 from luma primitives (all but chroma filters) */

    p.chroma[X265_CSP_I444].cu[BLOCK_4x4].sa8d = NULL;

    for (int i = 0; i < NUM_PU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
        p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
        p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
        p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED];
        p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
    }

    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
        p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
        p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
        p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED];
        p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
        p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
        p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
        p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
    }

    /* a 4x4 block has no 8x8 sub-blocks, so its sa8d is plain satd */
    p.cu[BLOCK_4x4].sa8d = p.pu[LUMA_4x4].satd;

    /* Chroma PU can often use luma satd primitives */
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = p.pu[LUMA_32x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = p.pu[LUMA_8x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = p.pu[LUMA_16x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = p.pu[LUMA_32x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = p.pu[LUMA_16x32].satd;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = p.pu[LUMA_16x12].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = p.pu[LUMA_12x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = p.pu[LUMA_16x4].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = p.pu[LUMA_32x24].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = p.pu[LUMA_24x32].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = p.pu[LUMA_32x8].satd;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = p.pu[LUMA_8x32].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = p.pu[LUMA_4x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = p.pu[LUMA_8x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = p.pu[LUMA_16x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = p.pu[LUMA_32x64].satd;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = p.pu[LUMA_8x8].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = p.pu[LUMA_4x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = p.pu[LUMA_16x16].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = p.pu[LUMA_8x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = p.pu[LUMA_32x32].satd;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = p.pu[LUMA_16x64].satd;

    /* non-square 4:2:2 sizes with no luma twin keep their own satd
     * (commented entries record the C templates they would map to) */
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = p.pu[LUMA_8x4].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_16x24] = satd8<16, 24>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = p.pu[LUMA_16x8].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_4x32] = satd4<4, 32>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_32x48] = satd8<32, 48>;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = p.pu[LUMA_32x16].satd;
    //p.chroma[X265_CSP_I422].satd[CHROMA_422_8x64] = satd8<8, 64>;

    /* blocks narrower than 4 pixels have no sa8d; callers must check NULL */
    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sa8d = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = p.pu[LUMA_4x4].satd;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = p.cu[BLOCK_8x8].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = p.cu[BLOCK_16x16].sa8d;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = p.cu[BLOCK_32x32].sa8d;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sa8d = NULL;
    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sa8d = p.pu[LUMA_4x8].satd;

    /* alias CU copy_pp from square PU copy_pp */
    for (int i = 0; i < NUM_CU_SIZES; i++)
    {
        p.cu[i].copy_pp = p.pu[i].copy_pp;

        for (int c = 0; c < X265_CSP_COUNT; c++)
            p.chroma[c].cu[i].copy_pp = p.chroma[c].pu[i].copy_pp;
    }

    p.chroma[X265_CSP_I420].cu[BLOCK_420_2x2].sse_pp = NULL;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = p.cu[BLOCK_4x4].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = p.cu[BLOCK_8x8].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = p.cu[BLOCK_16x16].sse_pp;
    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = p.cu[BLOCK_32x32].sse_pp;

    p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
}
212 | | |
213 | | void x265_report_simd(x265_param* param) |
214 | 0 | { |
215 | 0 | if (param->logLevel >= X265_LOG_INFO) |
216 | 0 | { |
217 | 0 | int cpuid = param->cpuid; |
218 | |
|
219 | 0 | char buf[1000]; |
220 | 0 | char *p = buf + snprintf(buf, sizeof(buf), "using cpu capabilities:"); |
221 | 0 | char *none = p; |
222 | 0 | for (int i = 0; X265_NS::cpu_names[i].flags; i++) |
223 | 0 | { |
224 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE") |
225 | 0 | && (cpuid & X265_CPU_SSE2)) |
226 | 0 | continue; |
227 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE2") |
228 | 0 | && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) |
229 | 0 | continue; |
230 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE3") |
231 | 0 | && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) |
232 | 0 | continue; |
233 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "SSE4.1") |
234 | 0 | && (cpuid & X265_CPU_SSE42)) |
235 | 0 | continue; |
236 | 0 | if (!strcmp(X265_NS::cpu_names[i].name, "BMI1") |
237 | 0 | && (cpuid & X265_CPU_BMI2)) |
238 | 0 | continue; |
239 | 0 | if ((cpuid & X265_NS::cpu_names[i].flags) == X265_NS::cpu_names[i].flags |
240 | 0 | && (!i || X265_NS::cpu_names[i].flags != X265_NS::cpu_names[i - 1].flags)) |
241 | 0 | p += snprintf(p, sizeof(buf) - (p - buf), " %s", X265_NS::cpu_names[i].name); |
242 | 0 | } |
243 | |
|
244 | 0 | if (p == none) |
245 | 0 | snprintf(p, sizeof(buf) - (p - buf), " none!"); |
246 | 0 | x265_log(param, X265_LOG_INFO, "%s\n", buf); |
247 | 0 | } |
248 | 0 | } |
249 | | |
/* One-time initialization of the global `primitives` table, then report the
 * active SIMD capabilities.  Idempotent: the pu[0].sad check skips re-init
 * on subsequent calls (NOTE(review): check itself is not thread-safe;
 * presumably serialized by the caller -- confirm).  Layering order matters:
 * C references first, then platform-optimized overrides, then aliases,
 * then the optional lowpass-DCT swap. */
void x265_setup_primitives(x265_param *param)
{
    if (!primitives.pu[0].sad)
    {
        setupCPrimitives(primitives);

        /* We do not want the encoder to use the un-optimized intra all-angles
         * C references. It is better to call the individual angle functions
         * instead. We must check for NULL before using this primitive */
        for (int i = 0; i < NUM_TR_SIZE; i++)
            primitives.cu[i].intra_pred_allangs = NULL;

#if ENABLE_ASSEMBLY
#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
        setupIntrinsicPrimitives(primitives, param->cpuid);
#endif
        setupAssemblyPrimitives(primitives, param->cpuid);
#endif
#if HAVE_ALTIVEC
        if (param->cpuid & X265_CPU_ALTIVEC)
        {
            setupPixelPrimitives_altivec(primitives);  // pixel_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupDCTPrimitives_altivec(primitives);    // dct_altivec.cpp, overwrite the initialization for altivec optimizated functions
            setupFilterPrimitives_altivec(primitives); // ipfilter.cpp, overwrite the initialization for altivec optimizated functions
            setupIntraPrimitives_altivec(primitives);  // intrapred_altivec.cpp, overwrite the initialization for altivec optimizated functions
        }
#endif

        /* aliases must be installed last so they capture the final pointers */
        setupAliasPrimitives(primitives);

        if (param->bLowPassDct)
        {
            enableLowpassDCTPrimitives(primitives);
        }
    }

    x265_report_simd(param);
}
288 | | } |
289 | | |
#if ENABLE_ASSEMBLY && X265_ARCH_X86
/* these functions are implemented in assembly. When assembly is not being
 * compiled, they are unnecessary and can be NOPs */
#else
extern "C" {
/* no-op stand-ins so the rest of the code can call them unconditionally */
int PFX(cpu_cpuid_test)(void) { return 0; }
void PFX(cpu_emms)(void) {}
/* reports no CPU features: only *eax is written, other outputs untouched */
void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}

#if X265_ARCH_ARM == 0
void PFX(cpu_neon_test)(void) {}
int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
#endif // X265_ARCH_ARM
}
#endif
305 | | #endif |