/src/x265/source/encoder/weightPrediction.cpp

Source (jump to first uncovered line)
/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
 *         Steve Borho <steve@borho.org>
 *         Kavitha Sampas <kavitha@multicorewareinc.com>
 *         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "picyuv.h"
#include "lowres.h"
#include "slice.h"
#include "mv.h"
#include "bitstream.h"
#include "threading.h"

using namespace X265_NS;
namespace {
/* Per-frame values shared by the weight analysis helpers in this file.
 * weightAnalyse() zero-fills the struct and then assigns every field once
 * before any helper is called. */
struct Cache
{
    const int * intraCost;        /* fenc lowres intra costs; weightCost() indexes it per 8x8 luma block to cap the SATD */
    int         numPredDir;       /* 1 when the slice is inter-P, otherwise 2 */
    int         csp;              /* param.internalCsp; selects the primitives.chroma[] entry */
    int         hshift;           /* CHROMA_H_SHIFT(csp) */
    int         vshift;           /* CHROMA_V_SHIFT(csp) */
    int         lowresWidthInCU;  /* lowres width >> 3 */
    int         lowresHeightInCU; /* lowres line count >> 3 */
};

/* Approximate the lambda-weighted bit cost of signalling one weight/offset
 * pair in the slice header.
 *   w       - weight parameters being costed (only w[0] is read)
 *   lambda  - lookahead lambda
 *   bChroma - non-zero when the parameters belong to a chroma plane */
int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
{
    const WeightParam& candidate = w[0];

    /* chroma is analyzed at full resolution, so its lambda is 4 times higher */
    const int scaledLambda = bChroma ? lambda * 4 : lambda;

    const int denomBits  = bs_size_ue(candidate.log2WeightDenom) * (2 - bChroma);
    const int weightBits = bs_size_se(candidate.inputWeight);
    const int offsetBits = bs_size_se(candidate.inputOffset);

    return scaledLambda * (10 + denomBits + 2 * (weightBits + offsetBits));
}

/* Build a motion compensated copy of the lowres reference luma plane.
 * Every 8x8 block of ref is predicted with its own vector from mvs (one
 * entry per block, in raster order) and written to mcout at the same offset
 * and with the same stride as the reference. The borders of mcout are not
 * extended. */
void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
{
    const int blockSize = 8;
    const int qpelScale = 1 << 2;
    const intptr_t lumaStride = ref.lumaStride;

    MV lowerBound, upperBound;
    int blockIdx = 0;

    for (int row = 0; row < ref.lines; row += blockSize)
    {
        const intptr_t rowStart = row * lumaStride;

        /* vertical range: at most 8 pixels above the first / below the last line */
        lowerBound.y = (int32_t)((-row - 8) * qpelScale);
        upperBound.y = (int32_t)((ref.lines - row - 1 + 8) * qpelScale);

        for (int col = 0; col < ref.width; col += blockSize)
        {
            ALIGN_VAR_16(pixel, scratch[8 * 8]);
            intptr_t scratchStride = 8;
            const intptr_t blockOffset = rowStart + col;

            /* horizontal range, same 8 pixel allowance */
            lowerBound.x = (int32_t)((-col - 8) * qpelScale);
            upperBound.x = (int32_t)((ref.width - col - 1 + 8) * qpelScale);

            /* keep the vector inside the available pixels */
            MV blockMv = mvs[blockIdx];
            blockMv = blockMv.clipped(lowerBound, upperBound);

            pixel *predicted = ref.lowresMC(blockOffset, blockMv, scratch, scratchStride, 0);
            primitives.cu[BLOCK_8x8].copy_pp(mcout + blockOffset, lumaStride, predicted, scratchStride);

            blockIdx++;
        }
    }
}

/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
 * if a block had cheaper lowres cost as intra, we treat it as MV 0
 *
 *   mcout  - destination plane, written with the same stride and offsets as src
 *   src    - reference chroma plane
 *   stride - stride shared by src and mcout
 *   mvs    - lowres motion vectors
 *   cache  - supplies csp, chroma shifts and the lowres block counts
 *   height - chroma plane height to process
 *   width  - chroma plane width to process */
void mcChroma(pixel *      mcout,
              pixel *      src,
              intptr_t     stride,
              const MV *   mvs,
              const Cache& cache,
              int          height,
              int          width)
{
    /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
     * luma blocks. We have to adapt block size to chroma csp */
    int csp = cache.csp;
    int bw = 16 >> cache.hshift;
    int bh = 16 >> cache.vshift;
    const int mvshift = 1 << 2; /* used as a multiplier: pixels -> MV units */
    MV mvmin, mvmax;

    for (int y = 0; y < height; y += bh)
    {
        /* note: lowres block count per row might be different from chroma block
         * count per row because of rounding issues, so be very careful with indexing
         * into the lowres structures */
        /* NOTE(review): y advances by bh, so it is a chroma pixel row rather
         * than a block row when it is multiplied by the block count per row
         * here - confirm this is the intended index into mvs */
        int cu = y * cache.lowresWidthInCU;
        intptr_t pixoff = y * stride;
        /* allow the vector to reach at most 8 pixels outside the plane */
        mvmin.y = (int32_t)((-y - 8) * mvshift);
        mvmax.y = (int32_t)((height - y - 1 + 8) * mvshift);

        for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
        {
            /* NOTE(review): x and y are chroma pixel coordinates while the
             * limits are counts of lowres blocks - confirm the units of this
             * comparison. Blocks failing the test are copied unmodified below */
            if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
            {
                MV mv = mvs[cu]; // lowres MV
                mv <<= 1;        // fullres MV
                mv.x >>= cache.hshift;
                mv.y >>= cache.vshift;

                /* clip MV to available pixels */
                mvmin.x = (int32_t)((-x - 8) * mvshift);
                mvmax.x = (int32_t)((width - x - 1 + 8) * mvshift);
                mv = mv.clipped(mvmin, mvmax);

                /* integer part of the vector selects the source block */
                intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
                pixel *temp = src + pixoff + fpeloffset;

                /* NOTE(review): the integer part above drops two bits but the
                 * fraction keeps three, so bit 2 contributes to both - confirm
                 * this is intended */
                int xFrac = mv.x & 7;
                int yFrac = mv.y & 7;
                if (!(yFrac | xFrac))
                {
                    /* no fractional part: plain copy */
                    primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
                }
                else if (!yFrac)
                {
                    /* horizontal fraction only */
                    primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
                }
                else if (!xFrac)
                {
                    /* vertical fraction only */
                    primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
                }
                else
                {
                    /* both fractions: horizontal pass into a 16-bit scratch
                     * buffer of stride bw, then vertical pass into mcout. The
                     * vertical pass starts (NTAPS_CHROMA / 2 - 1) rows into the
                     * scratch buffer; presumably the horizontal pass emits that
                     * many extra rows above the block - TODO confirm */
                    ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
                }
            }
            else
            {
                /* no vector used for this block: copy the co-located pixels */
                primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
            }
        }
    }
}

/* Measure sum of 8x8 satd costs between source frame and reference
 * frame (potentially weighted, potentially motion compensated). We
 * always use source images for this analysis since reference recon
 * pixels have unreliable availability
 *
 *   fenc       - source plane
 *   ref        - reference plane, same stride as fenc
 *   weightTemp - scratch plane that receives the weighted reference when w is set
 *   stride     - stride shared by fenc, ref and weightTemp
 *   cache      - supplies the intra cost cap (luma) and the csp (chroma)
 *   width      - plane width to measure
 *   height     - plane height to measure
 *   w          - weight to apply to ref first, or NULL for the unweighted cost
 *   bLuma      - true for the luma plane, false for a chroma plane
 *
 * Returns the accumulated SATD. For luma, each 8x8 block contributes at most
 * its entry in cache.intraCost. */
uint32_t weightCost(pixel *         fenc,
                    pixel *         ref,
                    pixel *         weightTemp,
                    intptr_t        stride,
                    const Cache &   cache,
                    int             width,
                    int             height,
                    WeightParam *   w,
                    bool            bLuma)
{
    if (w)
    {
        /* make a weighted copy of the reference plane. The offset is scaled to
         * the internal bit depth with a multiply instead of a left shift:
         * inputOffset may be negative, and left-shifting a negative signed
         * value is undefined behaviour before C++20 */
        int offset = w->inputOffset * (1 << (X265_DEPTH - 8));
        int weight = w->inputWeight;
        int denom = w->log2WeightDenom;
        int round = denom ? 1 << (denom - 1) : 0;
        int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
        int pwidth = ((width + 31) >> 5) << 5;          /* width rounded up to a multiple of 32 */
        primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
                             weight, round << correction, denom + correction, offset);
        ref = weightTemp;
    }

    uint32_t cost = 0;
    pixel *f = fenc, *r = ref;

    if (bLuma)
    {
        /* luma: 8x8 blocks, each capped by the block's intra cost */
        int cu = 0;
        for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
        {
            for (int x = 0; x < width; x += 8, cu++)
            {
                int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
                cost += X265_MIN(cmp, cache.intraCost[cu]);
            }
        }
    }
    else if (cache.csp == X265_CSP_I444)
    {
        /* 4:4:4 chroma: 16x16 blocks, no intra cap */
        for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
        {
            for (int x = 0; x < width; x += 16)
                cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
        }
    }
    else
    {
        /* other chroma formats: 8x8 blocks, no intra cap */
        for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
        {
            for (int x = 0; x < width; x += 8)
                cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
        }
    }

    return cost;
}
}

namespace X265_NS {
/* Search for explicit weighted prediction parameters for the current slice.
 *
 * For each prediction list, only reference index 0 is analysed. A scale is
 * guessed for every plane from the wp_ssd statistics of the source and
 * reference lowres frames, and then a small window of scale/offset pairs around
 * that guess is costed with weightCost() plus sliceHeaderCost(). When lowres
 * motion vectors are available the reference is motion compensated first. The
 * remaining references of each list are set to unweighted entries that share
 * the chosen denominators.
 *
 *   slice - receives the result in m_weightPredTable; weights are disabled
 *           instead if the scratch buffer cannot be allocated
 *   frame - frame being encoded (source picture and lowres data)
 *   param - encoder parameters (internalCsp, bframes, logLevel are read)
 *
 * At X265_LOG_FULL the chosen weights are also logged. */
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
{
    WeightParam wp[2][MAX_NUM_REF][3];
    PicYuv *fencPic = frame.m_fencPic;
    Lowres& fenc    = frame.m_lowres;

    Cache cache;

    memset(&cache, 0, sizeof(cache));
    cache.intraCost = fenc.intraCost;
    cache.numPredDir = slice.isInterP() ? 1 : 2;
    cache.lowresWidthInCU = fenc.width >> 3;
    cache.lowresHeightInCU = fenc.lines >> 3;
    cache.csp = param.internalCsp;
    cache.hshift = CHROMA_H_SHIFT(cache.csp);
    cache.vshift = CHROMA_V_SHIFT(cache.csp);

    /* Use single allocation for motion compensated ref and weight buffers */
    pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
    if (!mcbuf)
    {
        slice.disableWeights();
        return;
    }
    pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;

    int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
    int curPoc = slice.m_poc;
    const float epsilon = 1.f / 128.f;

    int chromaDenom, lumaDenom, denom;
    chromaDenom = lumaDenom = 7;
    int numpixels[3];
    int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
    int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
    numpixels[0] = w16 * h16;
    numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);

    for (int list = 0; list < cache.numPredDir; list++)
    {
        WeightParam *weights = wp[list][0];
        Frame *refFrame = slice.m_refFrameList[list][0];
        Lowres& refLowres = refFrame->m_lowres;
        int diffPoc = abs(curPoc - refFrame->m_poc);

        /* prepare estimates. The arrays are zero-initialised because only
         * plane 0 is filled for X265_CSP_I400, yet the chroma denominator loop
         * below reads guessScale[1] and guessScale[2] unconditionally; a zero
         * guess always fits and leaves chromaDenom untouched */
        float guessScale[3] = { 0 }, fencMean[3] = { 0 }, refMean[3] = { 0 };
        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
        {
            SET_WEIGHT(weights[plane], false, 1, 0, 0);
            /* add 1 to both terms when the reference statistic is zero so the
             * division below is always defined */
            uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
            uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
            guessScale[plane] = sqrt((float)fencVar / refVar);
            fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
            refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
        }

        /* make sure both our scale factors fit */
        while (!list && chromaDenom > 0)
        {
            float thresh = 127.f / (1 << chromaDenom);
            if (guessScale[1] < thresh && guessScale[2] < thresh)
                break;
            chromaDenom--;
        }

        SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
        SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);

        MV *mvs = NULL;

        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
        {
            denom = plane ? chromaDenom : lumaDenom;
            /* chroma is only analysed when a luma weight was selected */
            if (plane && !weights[0].wtPresent)
                break;

            /* Early termination */
            x265_emms();
            if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
            {
                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
                continue;
            }

            if (plane)
            {
                int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
                if (scale > 127)
                    continue;
                weights[plane].inputWeight = scale;
            }
            else
            {
                weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
            }

            int mindenom = weights[plane].log2WeightDenom;
            int minscale = weights[plane].inputWeight;
            int minoff = 0;

            /* the motion vectors are looked up once, while handling luma, and
             * then reused for the chroma planes of the same list */
            if (!plane && diffPoc <= param.bframes + 1)
            {
                mvs = fenc.lowresMvs[list][diffPoc];

                /* test whether this motion search was performed by lookahead */
                if (mvs[0].x != 0x7FFF)
                {
                    /* reference chroma planes must be extended prior to being
                     * used as motion compensation sources */
                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
                    {
                        refFrame->m_bChromaExtended = true;
                        PicYuv *refPic = refFrame->m_fencPic;
                        int width = refPic->m_picWidth >> cache.hshift;
                        int height = refPic->m_picHeight >> cache.vshift;
                        extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
                        extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
                    }
                }
                else
                    mvs = 0;
            }

            /* prepare inputs to weight analysis */
            pixel *orig;
            pixel *fref;
            intptr_t stride;
            int    width, height;
            switch (plane)
            {
            case 0:
                orig = fenc.lowresPlane[0];
                stride = fenc.lumaStride;
                width = fenc.width;
                height = fenc.lines;
                fref = refLowres.lowresPlane[0];
                if (mvs)
                {
                    mcLuma(mcbuf, refLowres, mvs);
                    fref = mcbuf;
                }
                break;

            case 1:
                orig = fencPic->m_picOrg[1];
                stride = fencPic->m_strideC;
                fref = refFrame->m_fencPic->m_picOrg[1];

                /* Clamp the chroma dimensions to the nearest multiple of
                 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
                 * blocks and weightCost measures 8x8 blocks. This
                 * potentially ignores some edge pixels, but simplifies the
                 * logic and prevents reading uninitialized pixels. Lowres
                 * planes are border extended and require no clamping. */
                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
                if (mvs)
                {
                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
                    fref = mcbuf;
                }
                break;

            case 2:
                orig = fencPic->m_picOrg[2];
                stride = fencPic->m_strideC;
                fref = refFrame->m_fencPic->m_picOrg[2];
                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
                if (mvs)
                {
                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
                    fref = mcbuf;
                }
                break;

            default:
                slice.disableWeights();
                X265_FREE(mcbuf);
                return;
            }

            /* unweighted cost is the baseline every candidate must beat */
            uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
            if (!origscore)
            {
                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
                continue;
            }

            uint32_t minscore = origscore;
            bool bFound = false;

            /* x264 uses a table lookup here, selecting search range based on preset */
            static const int scaleDist = 4;
            static const int offsetDist = 2;

            int startScale = x265_clip3(0, 127, minscale - scaleDist);
            int endScale   = x265_clip3(0, 127, minscale + scaleDist);
            for (int scale = startScale; scale <= endScale; scale++)
            {
                int deltaWeight = scale - (1 << mindenom);
                if (deltaWeight > 127 || deltaWeight <= -128)
                    continue;

                x265_emms();
                int curScale = scale;
                int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
                if (curOffset < -128 || curOffset > 127)
                {
                    /* Rescale considering the constraints on curOffset. We do it in this order
                     * because scale has a much wider range than offset (because of denom), so
                     * it should almost never need to be clamped. */
                    curOffset = x265_clip3(-128, 127, curOffset);

                    /* Clamp in the float domain before converting: refMean can be
                     * zero or tiny (e.g. an all-black reference), which makes the
                     * quotient infinite or larger than INT_MAX, and converting such
                     * a float to int is undefined behaviour */
                    float rescaled = (1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f;
                    if (!(rescaled > 0.f))
                        curScale = 0;
                    else if (rescaled > 127.f)
                        curScale = 127;
                    else
                        curScale = (int)rescaled;
                }

                int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
                int endOffset   = x265_clip3(-128, 127, curOffset + offsetDist);
                for (int off = startOffset; off <= endOffset; off++)
                {
                    WeightParam wsp;
                    SET_WEIGHT(wsp, true, curScale, mindenom, off);
                    uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
                                 sliceHeaderCost(&wsp, lambda, !!plane);
                    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);

                    /* Don't check any more offsets if the previous one had a lower cost than the current one */
                    if (minoff == startOffset && off != startOffset)
                        break;
                }
            }

            /* Use a smaller luma denominator if possible */
            if (!(plane || list))
            {
                if (mindenom > 0 && !(minscale & 1))
                {
                    unsigned long idx;
                    CTZ(idx, minscale);
                    int shift = X265_MIN((int)idx, mindenom);
                    mindenom -= shift;
                    minscale >>= shift;
                }
            }

            /* keep the plane unweighted when nothing better was found, when the
             * best candidate is the identity, or when the gain is under 0.2% */
            if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
            {
                SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
            }
            else
            {
                SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
            }
        }

        if (weights[0].wtPresent)
        {
            // Make sure both chroma channels match
            if (weights[1].wtPresent != weights[2].wtPresent)
            {
                if (weights[1].wtPresent)
                    weights[2] = weights[1];
                else
                    weights[1] = weights[2];
            }
        }

        /* the denominators chosen for this list carry over to the next one */
        lumaDenom = weights[0].log2WeightDenom;
        chromaDenom = weights[1].log2WeightDenom;

        /* reset weight states */
        for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
        {
            SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
            SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
            SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
        }
    }

    X265_FREE(mcbuf);

    memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);

    if (param.logLevel >= X265_LOG_FULL)
    {
        char buf[1024];
        int p = 0;
        bool bWeighted = false;

        p = sprintf(buf, "poc: %d weights:", slice.m_poc);
        int numPredDir = slice.isInterP() ? 1 : 2;
        for (int list = 0; list < numPredDir; list++)
        {
            WeightParam* w = &wp[list][0][0];
            if (w[0].wtPresent || w[1].wtPresent || w[2].wtPresent)
            {
                bWeighted = true;
                p += sprintf(buf + p, " [L%d:R0 ", list);
                if (w[0].wtPresent)
                    p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
                if (w[1].wtPresent)
                    p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
                if (w[2].wtPresent)
                    p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
                p += sprintf(buf + p, "]");
            }
        }

        if (bWeighted)
        {
            if (p < 80) // pad with spaces to ensure progress line overwritten
                sprintf(buf + p, "%*s", 80 - p, " ");
            x265_log(&param, X265_LOG_FULL, "%s\n", buf);
        }
    }
}
}

Coverage Report

Created: 2022-08-24 06:17

Line	Count	Source (jump to first uncovered line)
1		/*****************************************************************************
2		* Copyright (C) 2013-2020 MulticoreWare, Inc
3		*
4		* Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5		* Steve Borho <steve@borho.org>
6		* Kavitha Sampas <kavitha@multicorewareinc.com>
7		* Min Chen <chenm003@163.com>
8		*
9		* This program is free software; you can redistribute it and/or modify
10		* it under the terms of the GNU General Public License as published by
11		* the Free Software Foundation; either version 2 of the License, or
12		* (at your option) any later version.
13		*
14		* This program is distributed in the hope that it will be useful,
15		* but WITHOUT ANY WARRANTY; without even the implied warranty of
16		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17		* GNU General Public License for more details.
18		*
19		* You should have received a copy of the GNU General Public License
20		* along with this program; if not, write to the Free Software
21		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22		*
23		* This program is also available under a commercial proprietary license.
24		* For more information, contact us at license @ x265.com.
25		*****************************************************************************/
26
27		#include "common.h"
28		#include "frame.h"
29		#include "picyuv.h"
30		#include "lowres.h"
31		#include "slice.h"
32		#include "mv.h"
33		#include "bitstream.h"
34		#include "threading.h"
35
36		using namespace X265_NS;
37		namespace {
38		struct Cache
39		{
40		const int * intraCost;
41		int numPredDir;
42		int csp;
43		int hshift;
44		int vshift;
45		int lowresWidthInCU;
46		int lowresHeightInCU;
47		};
48
49		int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
50	0	{
51		/* 4 times higher, because chroma is analyzed at full resolution. */
52	0	if (bChroma)
53	0	lambda *= 4;
54	0	int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
55	0	return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
56	0	}
57
58		/* make a motion compensated copy of lowres ref into mcout with the same stride.
59		* The borders of mcout are not extended */
60		void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
61	0	{
62	0	intptr_t stride = ref.lumaStride;
63	0	const int mvshift = 1 << 2;
64	0	const int cuSize = 8;
65	0	MV mvmin, mvmax;
66
67	0	int cu = 0;
68
69	0	for (int y = 0; y < ref.lines; y += cuSize)
70	0	{
71	0	intptr_t pixoff = y * stride;
72	0	mvmin.y = (int32_t)((-y - 8) * mvshift);
73	0	mvmax.y = (int32_t)((ref.lines - y - 1 + 8) * mvshift);
74
75	0	for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
76	0	{
77	0	ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
78	0	intptr_t bstride = 8;
79	0	mvmin.x = (int32_t)((-x - 8) * mvshift);
80	0	mvmax.x = (int32_t)((ref.width - x - 1 + 8) * mvshift);
81
82		/* clip MV to available pixels */
83	0	MV mv = mvs[cu];
84	0	mv = mv.clipped(mvmin, mvmax);
85	0	pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
86	0	primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
87	0	}
88	0	}
89	0	}
90
91		/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
92		* if a block had cheaper lowres cost as intra, we treat it as MV 0 */
93		void mcChroma(pixel * mcout,
94		pixel * src,
95		intptr_t stride,
96		const MV * mvs,
97		const Cache& cache,
98		int height,
99		int width)
100	0	{
101		/* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
102		* luma blocks. We have to adapt block size to chroma csp */
103	0	int csp = cache.csp;
104	0	int bw = 16 >> cache.hshift;
105	0	int bh = 16 >> cache.vshift;
106	0	const int mvshift = 1 << 2;
107	0	MV mvmin, mvmax;
108
109	0	for (int y = 0; y < height; y += bh)
110	0	{
111		/* note: lowres block count per row might be different from chroma block
112		* count per row because of rounding issues, so be very careful with indexing
113		* into the lowres structures */
114	0	int cu = y * cache.lowresWidthInCU;
115	0	intptr_t pixoff = y * stride;
116	0	mvmin.y = (int32_t)((-y - 8) * mvshift);
117	0	mvmax.y = (int32_t)((height - y - 1 + 8) * mvshift);
118
119	0	for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
120	0	{
121	0	if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
122	0	{
123	0	MV mv = mvs[cu]; // lowres MV
124	0	mv <<= 1; // fullres MV
125	0	mv.x >>= cache.hshift;
126	0	mv.y >>= cache.vshift;
127
128		/* clip MV to available pixels */
129	0	mvmin.x = (int32_t)((-x - 8) * mvshift);
130	0	mvmax.x = (int32_t)((width - x - 1 + 8) * mvshift);
131	0	mv = mv.clipped(mvmin, mvmax);
132
133	0	intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
134	0	pixel *temp = src + pixoff + fpeloffset;
135
136	0	int xFrac = mv.x & 7;
137	0	int yFrac = mv.y & 7;
138	0	if (!(yFrac | xFrac))
139	0	{
140	0	primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
141	0	}
142	0	else if (!yFrac)
143	0	{
144	0	primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
145	0	}
146	0	else if (!xFrac)
147	0	{
148	0	primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
149	0	}
150	0	else
151	0	{
152	0	ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
153	0	primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
154	0	primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
155	0	}
156	0	}
157	0	else
158	0	{
159	0	primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
160	0	}
161	0	}
162	0	}
163	0	}
164
165		/* Measure sum of 8x8 satd costs between source frame and reference
166		* frame (potentially weighted, potentially motion compensated). We
167		* always use source images for this analysis since reference recon
168		* pixels have unreliable availability */
169		uint32_t weightCost(pixel * fenc,
170		pixel * ref,
171		pixel * weightTemp,
172		intptr_t stride,
173		const Cache & cache,
174		int width,
175		int height,
176		WeightParam * w,
177		bool bLuma)
178	0	{
179	0	if (w)
180	0	{
181		/* make a weighted copy of the reference plane */
182	0	int offset = w->inputOffset << (X265_DEPTH - 8);
183	0	int weight = w->inputWeight;
184	0	int denom = w->log2WeightDenom;
185	0	int round = denom ? 1 << (denom - 1) : 0;
186	0	int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
187	0	int pwidth = ((width + 31) >> 5) << 5;
188	0	primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
189	0	weight, round << correction, denom + correction, offset);
190	0	ref = weightTemp;
191	0	}
192
193	0	uint32_t cost = 0;
194	0	pixel *f = fenc, *r = ref;
195
196	0	if (bLuma)
197	0	{
198	0	int cu = 0;
199	0	for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
200	0	{
201	0	for (int x = 0; x < width; x += 8, cu++)
202	0	{
203	0	int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
204	0	cost += X265_MIN(cmp, cache.intraCost[cu]);
205	0	}
206	0	}
207	0	}
208	0	else if (cache.csp == X265_CSP_I444)
209	0	for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
210	0	for (int x = 0; x < width; x += 16)
211	0	cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
212	0	else
213	0	for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
214	0	for (int x = 0; x < width; x += 8)
215	0	cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
216
217	0	return cost;
218	0	}
219		}
220
221		namespace X265_NS {
222		void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
223	0	{
224	0	WeightParam wp[2][MAX_NUM_REF][3];
225	0	PicYuv *fencPic = frame.m_fencPic;
226	0	Lowres& fenc = frame.m_lowres;
227
228	0	Cache cache;
229
230	0	memset(&cache, 0, sizeof(cache));
231	0	cache.intraCost = fenc.intraCost;
232	0	cache.numPredDir = slice.isInterP() ? 1 : 2;
233	0	cache.lowresWidthInCU = fenc.width >> 3;
234	0	cache.lowresHeightInCU = fenc.lines >> 3;
235	0	cache.csp = param.internalCsp;
236	0	cache.hshift = CHROMA_H_SHIFT(cache.csp);
237	0	cache.vshift = CHROMA_V_SHIFT(cache.csp);
238
239		/* Use single allocation for motion compensated ref and weight buffers */
240	0	pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
241	0	if (!mcbuf)
242	0	{
243	0	slice.disableWeights();
244	0	return;
245	0	}
246	0	pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
247
248	0	int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
249	0	int curPoc = slice.m_poc;
250	0	const float epsilon = 1.f / 128.f;
251
252	0	int chromaDenom, lumaDenom, denom;
253	0	chromaDenom = lumaDenom = 7;
254	0	int numpixels[3];
255	0	int w16 = ((fencPic->m_picWidth + 15) >> 4) << 4;
256	0	int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
257	0	numpixels[0] = w16 * h16;
258	0	numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
259
260	0	for (int list = 0; list < cache.numPredDir; list++)
261	0	{
262	0	WeightParam *weights = wp[list][0];
263	0	Frame *refFrame = slice.m_refFrameList[list][0];
264	0	Lowres& refLowres = refFrame->m_lowres;
265	0	int diffPoc = abs(curPoc - refFrame->m_poc);
266
267		/* prepare estimates */
268	0	float guessScale[3], fencMean[3], refMean[3];
269	0	for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
270	0	{
271	0	SET_WEIGHT(weights[plane], false, 1, 0, 0);
272	0	uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
273	0	uint64_t refVar = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
274	0	guessScale[plane] = sqrt((float)fencVar / refVar);
275	0	fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
276	0	refMean[plane] = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
277	0	}
278
279		/* make sure both our scale factors fit */
280	0	while (!list && chromaDenom > 0)
281	0	{
282	0	float thresh = 127.f / (1 << chromaDenom);
283	0	if (guessScale[1] < thresh && guessScale[2] < thresh)
284	0	break;
285	0	chromaDenom--;
286	0	}
287
288	0	SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
289	0	SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
290
291	0	MV *mvs = NULL;
292
293	0	for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
294	0	{
295	0	denom = plane ? chromaDenom : lumaDenom;
296	0	if (plane && !weights[0].wtPresent)
297	0	break;
298
299		/* Early termination */
300	0	x265_emms();
301	0	if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
302	0	{
303	0	SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
304	0	continue;
305	0	}
306
307	0	if (plane)
308	0	{
309	0	int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
310	0	if (scale > 127)
311	0	continue;
312	0	weights[plane].inputWeight = scale;
313	0	}
314	0	else
315	0	{
316	0	weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
317	0	}
318
319	0	int mindenom = weights[plane].log2WeightDenom;
320	0	int minscale = weights[plane].inputWeight;
321	0	int minoff = 0;
322
323	0	if (!plane && diffPoc <= param.bframes + 1)
324	0	{
325	0	mvs = fenc.lowresMvs[list][diffPoc];
326
327		/* test whether this motion search was performed by lookahead */
328	0	if (mvs[0].x != 0x7FFF)
329	0	{
330		/* reference chroma planes must be extended prior to being
331		* used as motion compensation sources */
332	0	if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
333	0	{
334	0	refFrame->m_bChromaExtended = true;
335	0	PicYuv *refPic = refFrame->m_fencPic;
336	0	int width = refPic->m_picWidth >> cache.hshift;
337	0	int height = refPic->m_picHeight >> cache.vshift;
338	0	extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
339	0	extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
340	0	}
341	0	}
342	0	else
343	0	mvs = 0;
344	0	}
345
346		/* prepare inputs to weight analysis */
347	0	pixel *orig;
348	0	pixel *fref;
349	0	intptr_t stride;
350	0	int width, height;
351	0	switch (plane)
352	0	{
353	0	case 0:
354	0	orig = fenc.lowresPlane[0];
355	0	stride = fenc.lumaStride;
356	0	width = fenc.width;
357	0	height = fenc.lines;
358	0	fref = refLowres.lowresPlane[0];
359	0	if (mvs)
360	0	{
361	0	mcLuma(mcbuf, refLowres, mvs);
362	0	fref = mcbuf;
363	0	}
364	0	break;
365
366	0	case 1:
367	0	orig = fencPic->m_picOrg[1];
368	0	stride = fencPic->m_strideC;
369	0	fref = refFrame->m_fencPic->m_picOrg[1];
370
371		/* Clamp the chroma dimensions to the nearest multiple of
372		* 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
373		* blocks and weightCost measures 8x8 blocks. This
374		* potentially ignores some edge pixels, but simplifies the
375		* logic and prevents reading uninitialized pixels. Lowres
376		* planes are border extended and require no clamping. */
377	0	width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
378	0	height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
379	0	if (mvs)
380	0	{
381	0	mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
382	0	fref = mcbuf;
383	0	}
384	0	break;
385
386	0	case 2:
387	0	orig = fencPic->m_picOrg[2];
388	0	stride = fencPic->m_strideC;
389	0	fref = refFrame->m_fencPic->m_picOrg[2];
390	0	width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
391	0	height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
392	0	if (mvs)
393	0	{
394	0	mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
395	0	fref = mcbuf;
396	0	}
397	0	break;
398
399	0	default:
400	0	slice.disableWeights();
401	0	X265_FREE(mcbuf);
402	0	return;
403	0	}
404
405	0	uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
406	0	if (!origscore)
407	0	{
408	0	SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
409	0	continue;
410	0	}
411
412	0	uint32_t minscore = origscore;
413	0	bool bFound = false;
414
415		/* x264 uses a table lookup here, selecting search range based on preset */
416	0	static const int scaleDist = 4;
417	0	static const int offsetDist = 2;
418
419	0	int startScale = x265_clip3(0, 127, minscale - scaleDist);
420	0	int endScale = x265_clip3(0, 127, minscale + scaleDist);
421	0	for (int scale = startScale; scale <= endScale; scale++)
422	0	{
423	0	int deltaWeight = scale - (1 << mindenom);
424	0	if (deltaWeight > 127 \|\| deltaWeight <= -128)
425	0	continue;
426
427	0	x265_emms();
428	0	int curScale = scale;
429	0	int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
430	0	if (curOffset < -128 \|\| curOffset > 127)
431	0	{
432		/* Rescale considering the constraints on curOffset. We do it in this order
433		* because scale has a much wider range than offset (because of denom), so
434		* it should almost never need to be clamped. */
435	0	curOffset = x265_clip3(-128, 127, curOffset);
436	0	curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
437	0	curScale = x265_clip3(0, 127, curScale);
438	0	}
439
440	0	int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
441	0	int endOffset = x265_clip3(-128, 127, curOffset + offsetDist);
442	0	for (int off = startOffset; off <= endOffset; off++)
443	0	{
444	0	WeightParam wsp;
445	0	SET_WEIGHT(wsp, true, curScale, mindenom, off);
446	0	uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
447	0	sliceHeaderCost(&wsp, lambda, !!plane);
448	0	COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
449
450		/* Don't check any more offsets if the previous one had a lower cost than the current one */
451	0	if (minoff == startOffset && off != startOffset)
452	0	break;
453	0	}
454	0	}
455
456		/* Use a smaller luma denominator if possible */
457	0	if (!(plane \|\| list))
458	0	{
459	0	if (mindenom > 0 && !(minscale & 1))
460	0	{
461	0	unsigned long idx;
462	0	CTZ(idx, minscale);
463	0	int shift = X265_MIN((int)idx, mindenom);
464	0	mindenom -= shift;
465	0	minscale >>= shift;
466	0	}
467	0	}
468
469	0	if (!bFound \|\| (minscale == (1 << mindenom) && minoff == 0) \|\| (float)minscore / origscore > 0.998f)
470	0	{
471	0	SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
472	0	}
473	0	else
474	0	{
475	0	SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
476	0	}
477	0	}
478
479	0	if (weights[0].wtPresent)
480	0	{
481		// Make sure both chroma channels match
482	0	if (weights[1].wtPresent != weights[2].wtPresent)
483	0	{
484	0	if (weights[1].wtPresent)
485	0	weights[2] = weights[1];
486	0	else
487	0	weights[1] = weights[2];
488	0	}
489	0	}
490
491	0	lumaDenom = weights[0].log2WeightDenom;
492	0	chromaDenom = weights[1].log2WeightDenom;
493
494		/* reset weight states */
495	0	for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
496	0	{
497	0	SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
498	0	SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
499	0	SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
500	0	}
501	0	}
502
503	0	X265_FREE(mcbuf);
504
505	0	memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
506
507	0	if (param.logLevel >= X265_LOG_FULL)
508	0	{
509	0	char buf[1024];
510	0	int p = 0;
511	0	bool bWeighted = false;
512
513	0	p = sprintf(buf, "poc: %d weights:", slice.m_poc);
514	0	int numPredDir = slice.isInterP() ? 1 : 2;
515	0	for (int list = 0; list < numPredDir; list++)
516	0	{
517	0	WeightParam* w = &wp[list][0][0];
518	0	if (w[0].wtPresent \|\| w[1].wtPresent \|\| w[2].wtPresent)
519	0	{
520	0	bWeighted = true;
521	0	p += sprintf(buf + p, " [L%d:R0 ", list);
522	0	if (w[0].wtPresent)
523	0	p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
524	0	if (w[1].wtPresent)
525	0	p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
526	0	if (w[2].wtPresent)
527	0	p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
528	0	p += sprintf(buf + p, "]");
529	0	}
530	0	}
531
532	0	if (bWeighted)
533	0	{
534	0	if (p < 80) // pad with spaces to ensure progress line overwritten
535	0	sprintf(buf + p, "%*s", 80 - p, " ");
536	0	x265_log(&param, X265_LOG_FULL, "%s\n", buf);
537	0	}
538	0	}
539	0	}
540		}