Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/encoder/search.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Steve Borho <steve@borho.org>
5
*          Min Chen <chenm003@163.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 2 of the License, or
10
* (at your option) any later version.
11
*
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
* GNU General Public License for more details.
16
*
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
*
21
* This program is also available under a commercial proprietary license.
22
* For more information, contact us at license @ x265.com.
23
*****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "picyuv.h"
28
#include "cudata.h"
29
30
#include "search.h"
31
#include "entropy.h"
32
#include "rdcost.h"
33
34
#include "analysis.h"  // TLD
35
#include "framedata.h"
36
37
using namespace X265_NS;
38
39
#if _MSC_VER
40
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
41
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
42
#pragma warning(disable: 4127) // conditional expression is constant
43
#endif
44
45
0
#define MVP_IDX_BITS 1
46
47
/* Definition of the static Search::zeroShort table: MAX_CU_SIZE int16_t entries,
 * all zero (brace-initialised with { 0 }).
 * NOTE(review): ALIGN_VAR_32 presumably requests 32-byte alignment - confirm in common.h */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
48
49
/* Default constructor: leaves the object in a "nothing allocated yet" state.
 * All buffer pointers are cleared and the counters are reset; the actual
 * allocations are performed later by initSearch(). */
Search::Search()
{
    /* clear the whole per-layer table in one go */
    memset(m_rqt, 0, sizeof(m_rqt));

    /* three-entry scratch tables */
    for (int plane = 0; plane < 3; plane++)
    {
        m_qtTempCbf[plane] = NULL;
        m_qtTempTransformSkipFlag[plane] = NULL;
    }

    /* intra prediction work buffers */
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;

    /* transform-skip work buffers */
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;

    /* encoder context, filled in later */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    /* counters */
    m_numLayers = 0;
    m_maxTUDepth = -1;
}
72
73
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
74
0
{
75
0
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
76
0
    m_param = &param;
77
0
    m_bFrameParallel = param.frameNumThreads > 1;
78
0
    m_numLayers = g_log2Size[param.maxCUSize] - 2;
79
#if ENABLE_SCC_EXT
80
    m_ibcEnabled = param.bEnableSCC;
81
#endif
82
83
0
    m_rdCost.setPsyRdScale(param.psyRd);
84
0
    m_rdCost.setSsimRd(param.bSsimRd);
85
0
    m_me.init(param.internalCsp);
86
87
0
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
88
0
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
89
0
        ok &= m_quant.allocNoiseReduction(param);
90
91
0
    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
92
93
    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
94
     * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
95
0
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
96
97
0
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
98
0
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
99
0
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
100
101
0
    m_limitTU = 0;
102
0
    if (m_param->limitTU)
103
0
    {
104
0
        if (m_param->limitTU == 1)
105
0
            m_limitTU = X265_TU_LIMIT_BFS;
106
0
        else if (m_param->limitTU == 2)
107
0
            m_limitTU = X265_TU_LIMIT_DFS;
108
0
        else if (m_param->limitTU == 3)
109
0
            m_limitTU = X265_TU_LIMIT_NEIGH;
110
0
        else if (m_param->limitTU == 4)
111
0
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
112
0
    }
113
114
    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
115
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
116
     * which are reconstructed at each depth are valid. At the end, the transform depth table
117
     * is walked and the coeff and recon at the correct depths are collected */
118
119
0
    if (param.internalCsp != X265_CSP_I400)
120
0
    {
121
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
122
0
        {
123
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
124
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
125
0
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
126
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
127
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
128
0
        }
129
0
    }
130
0
    else
131
0
    {
132
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
133
0
        {
134
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
135
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
136
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
137
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
138
0
        }
139
0
    }
140
141
    /* the rest of these buffers are indexed per-depth */
142
0
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
143
0
    {
144
0
        int cuSize = param.maxCUSize >> i;
145
0
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
146
0
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
147
0
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
148
0
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
149
0
    }
150
151
0
    if (param.internalCsp != X265_CSP_I400)
152
0
    {
153
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
154
0
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
155
0
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
156
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
157
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
158
0
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
159
0
    }
160
0
    else
161
0
    {
162
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
163
0
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
164
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
165
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
166
0
    }
167
168
0
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
169
0
    m_fencScaled = m_intraPred + 32 * 32;
170
0
    m_fencTransposed = m_fencScaled + 32 * 32;
171
0
    m_intraPredAngs = m_fencTransposed + 32 * 32;
172
173
0
    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
174
0
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
175
0
    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
176
177
#if ENABLE_SCC_EXT
178
    m_numBVs = 0;
179
    m_numBV16s = 0;
180
#endif
181
182
0
    return ok;
183
184
0
fail:
185
0
    return false;
186
0
}
187
188
/* Releases every buffer owned by the search object.
 *
 * The destructor must also be safe on an instance whose initSearch() was never
 * run: the constructor leaves m_param NULL and m_numLayers 0, so the per-depth
 * loop (which reads m_param->maxCUDepth) is only entered when m_param has been
 * set.  initSearch() stores m_param before it creates any per-depth buffer, so
 * skipping the loop in that case cannot leak anything. */
Search::~Search()
{
    /* buffers indexed by quad-tree layer */
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    /* buffers indexed by CU depth; the depth count lives in the parameter set */
    if (m_param)
    {
        for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
        {
            m_rqt[i].tmpResiYuv.destroy();
            m_rqt[i].tmpPredYuv.destroy();
            m_rqt[i].bidirPredYuv[0].destroy();
            m_rqt[i].bidirPredYuv[1].destroy();
        }
    }

    /* only element [0] of each table owns memory; [1] and [2] alias into it */
    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
    /* m_fencScaled, m_fencTransposed and m_intraPredAngs point into m_intraPred */
    X265_FREE(m_intraPred);
    X265_FREE(m_tsCoeff);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsRecon);
}
212
213
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
214
0
{
215
0
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
216
217
0
    m_me.setQP(qp);
218
0
    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
219
220
0
    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
221
0
    m_quant.setQPforQuant(ctu, quantQP);
222
0
    return quantQP;
223
0
}
224
225
#if CHECKED_BUILD || _DEBUG
226
void Search::invalidateContexts(int fromDepth)
227
{
228
    /* catch reads without previous writes */
229
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
230
    {
231
        m_rqt[d].cur.markInvalid();
232
        m_rqt[d].rqtTemp.markInvalid();
233
        m_rqt[d].rqtRoot.markInvalid();
234
        m_rqt[d].rqtTest.markInvalid();
235
    }
236
}
237
#else
238
0
void Search::invalidateContexts(int) {}
239
#endif
240
241
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
242
0
{
243
0
    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
244
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
245
246
0
    if (!(log2TrSize - m_hChromaShift < 2))
247
0
    {
248
0
        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
249
0
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
250
0
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
251
0
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
252
0
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
253
0
    }
254
255
0
    if (subdiv)
256
0
    {
257
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
258
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
259
0
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
260
0
    }
261
0
}
262
263
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
264
0
{
265
0
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
266
0
        return;
267
268
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
269
270
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
271
0
    {
272
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
273
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
274
0
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
275
276
0
        return;
277
0
    }
278
279
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
280
281
0
    if (log2TrSizeC < 2)
282
0
    {
283
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
284
0
        if (absPartIdx & 3)
285
0
            return;
286
0
        log2TrSizeC = 2;
287
0
    }
288
289
0
    uint32_t qtLayer = log2TrSize - 2;
290
291
0
    if (m_csp != X265_CSP_I422)
292
0
    {
293
0
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
294
0
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
295
0
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
296
0
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
297
0
    }
298
0
    else
299
0
    {
300
0
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
301
0
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
302
0
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
303
0
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
304
0
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
305
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
306
0
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
307
0
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
308
0
    }
309
0
}
310
311
/* Recursive RD evaluation of the intra luma transform tree.
 *
 * For the TU at (tuDepth, absPartIdx) this measures, where permitted, the cost
 * of coding it as a single transform ("full") and the cost of splitting it in
 * four ("split"), keeps the cheaper choice and adds its cost to outCost.
 *
 * mode        - mode under test; supplies the CU, source and prediction buffers
 * cuGeom      - geometry of the CU
 * tuDepth     - transform depth relative to the CU
 * absPartIdx  - partition index of the TU inside the CU
 * bAllowSplit - when false, a split is only tried if the full TU is not allowed
 * outCost     - accumulator; the winning cost is ADDED to it
 * depthRange  - [0] is the log2 size at or below which no split is tried,
 *               [1] is the largest log2 size that may be coded unsplit */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t fullDepth  = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t qtLayer    = log2TrSize - 2;
    const uint32_t sizeIdx    = log2TrSize - 2;
    const bool bEnableRDOQ    = !!m_param->rdoqLevel;

    bool mightNotSplit = log2TrSize <= depthRange[1];
    bool mightSplit    = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    Cost fullCost;
    uint32_t fullCbf = 0;

    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (!mightNotSplit)
    {
        /* full TU not allowed: make sure the split always wins the comparison */
        fullCost.rdcost = MAX_INT64;
    }
    else
    {
        /* remember the entropy state so the split trial can start from it */
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride   = mode.fencYuv->m_size;

        // set up the neighbour availability pattern for this TU
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // build the prediction
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* lumaCoeff    = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // refresh the bit estimates used by RDOQ
        if (bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, lumaCoeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (!numSig)
        {
            // no coded residual, recon = pred
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
        }
        else
        {
            m_quant.invtransformNxN(cu, residual, stride, lumaCoeff, log2TrSize, TEXT_LUMA, true, false, numSig);

            /* the add_ps variant is selected by whether all strides and offsets are multiples of 64 */
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
        }

        fullCbf = (numSig ? 1 : 0) << tuDepth;
        cu.setCbfSubParts(fullCbf, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);

        /* count the bits of everything signalled for this TU */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            /* CU-level syntax is counted once, with the first partition */
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }

        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t partsPerQuad = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t quad = 0; quad < 4; quad++)
                    m_entropyCoder.codeIntraDirLumaAng(cu, quad * partsPerQuad, false);
            }
            else if (!(absPartIdx & (partsPerQuad - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }

        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(numSig != 0, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, lumaCoeff, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        /* rdPenalty: quadruple the bit count of 32x32 intra TUs outside I slices */
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // keep the state reached by the full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // rewind for the split encode
        }

        /* code split block */
        uint32_t partsPerQuad = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);

        /* the children are coded with the transform-skip trial when the PPS
         * enables it, the child size is small enough and the CU is not bypassed */
        bool bTryTSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            bTryTSkip = bTryTSkip && (cu.m_partSize[0] != SIZE_2Nx2N);

        Cost splitCost;
        uint32_t splitCbf = 0;
        uint32_t childIdx = absPartIdx;
        for (uint32_t quad = 0; quad < 4; quad++)
        {
            if (bTryTSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, childIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, childIdx, bAllowSplit, splitCost, depthRange);

            splitCbf |= cu.getCbf(childIdx, TEXT_LUMA, tuDepth + 1);
            childIdx += partsPerQuad;
        }
        cu.m_cbf[0][absPartIdx] |= (splitCbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else if (m_rdCost.m_ssimRd)
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            /* split wins: report its cost and leave the children's state in place */
            outCost.rdcost     += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits       += splitCost.bits;
            outCost.energy     += splitCost.energy;
            return;
        }

        // full TU wins: go back to the entropy state of the full-size TU encode
        m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

        // and put back the TU depth, cbf and transform-skip values the children overwrote
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
        cu.setCbfSubParts(fullCbf, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    PicYuv*  reconPic = m_frame->m_reconPic[0];
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost     += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits       += fullCost.bits;
    outCost.energy     += fullCost.energy;
}
515
516
/* Code one intra luma TU twice - first with the regular transform, then with
 * transform skip - and keep whichever has the lower RD cost.
 *
 * The regular trial writes straight into the quad-tree coefficient and recon
 * buffers; the transform-skip trial uses m_tsCoeff / m_tsRecon and is copied
 * over only if it wins.  A transform-skip trial that produces no significant
 * coefficient is abandoned.  The winning cost is ADDED to outCost and the
 * winning reconstruction is copied into the frame's reconstructed picture. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    bool bEnableRDOQ = !!m_param->rdoqLevel;

    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;
    Yuv* predYuv = &mode.predYuv;
    const Yuv* fencYuv = mode.fencYuv;

    /* best result so far; starts at the maximum so the first trial always wins */
    Cost bestCost;
    bestCost.rdcost = MAX_INT64;
    int      bestTSkip = 0;
    uint32_t bestCbf = 0;

    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
    pixel*   pred = predYuv->getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t stride = fencYuv->m_size;
    uint32_t sizeIdx = log2TrSize - 2;

    // set up the neighbour availability pattern for this TU
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

    // build the prediction
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // remember the entropy state both trials start from
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    /* pass 0 = regular transform, pass 1 = transform skip */
    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t trialCost;
        uint32_t trialEnergy = 0;

        /* the tskip pass works in private buffers so the regular result survives */
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
        pixel*   trialRecon = (useTSkip ? m_tsRecon : reconQt);
        bool trialReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
        uint32_t trialReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);

        if (!numSig && useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }

        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);

            /* the add_ps variant is selected by whether all strides and offsets are multiples of 64 */
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
            bool bufferAlignCheck = (stride % 64 == 0) && (trialReconStride % 64 == 0) && trialReconAlign && residualAlign && predAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](trialRecon, trialReconStride, pred, residual, stride, stride);
        }
        else
        {
            // no residual coded, recon = pred
            primitives.cu[sizeIdx].copy_pp(trialRecon, trialReconStride, pred, stride);
        }

        sse_t trialDist = primitives.cu[sizeIdx].sse_pp(trialRecon, trialReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((numSig ? 1 : 0) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        /* the tskip pass must start from the same entropy state as the regular one */
        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        /* count the bits of everything signalled for this TU */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            /* CU-level syntax is counted once, with the first partition */
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }

        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t partsPerQuad = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t quad = 0; quad < 4; quad++)
                    m_entropyCoder.codeIntraDirLumaAng(cu, quad * partsPerQuad, false);
            }
            else if (!(absPartIdx & (partsPerQuad - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(numSig != 0, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t trialBits = m_entropyCoder.getNumberOfWrittenBits();

        /* keep the state after the regular encode in case it ends up the winner */
        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            trialEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, trialRecon, trialReconStride);
            trialCost = m_rdCost.calcPsyRdCost(trialDist, trialBits, trialEnergy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            trialEnergy = m_quant.ssimDistortion(cu, fenc, stride, trialRecon, trialReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
            trialCost = m_rdCost.calcSsimRdCost(trialDist, trialBits, trialEnergy);
        }
        else
            trialCost = m_rdCost.calcRdCost(trialDist, trialBits);

        if (trialCost < bestCost.rdcost)
        {
            bestTSkip = useTSkip;
            bestCbf = numSig ? 1 : 0;
            bestCost.rdcost = trialCost;
            bestCost.distortion = trialDist;
            bestCost.bits = trialBits;
            bestCost.energy = trialEnergy;
        }
    }

    if (bestTSkip)
    {
        /* tskip won: move its coefficients and reconstruction into the quad-tree buffers */
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
    }
    else if (checkTransformSkip)
    {
        /* tskip was tried and lost: undo the flags and entropy state it left behind */
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bestCbf << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // set reconstruction for next intra prediction blocks
    PicYuv*  reconPic = m_frame->m_reconPic[0];
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += bestCost.rdcost;
    outCost.distortion += bestCost.distortion;
    outCost.bits += bestCost.bits;
    outCost.energy += bestCost.energy;
}
691
692
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
693
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
694
0
{
695
0
    CUData& cu = mode.cu;
696
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
697
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
698
0
    bool     bCheckFull = log2TrSize <= depthRange[1];
699
700
0
    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
701
702
    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
703
     * since we are not measuring RD cost */
704
0
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
705
0
        bCheckFull = false;
706
707
0
    if (bCheckFull)
708
0
    {
709
0
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
710
0
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
711
0
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
712
0
        uint32_t stride   = mode.fencYuv->m_size;
713
714
        // init availability pattern
715
0
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
716
0
        IntraNeighbors intraNeighbors;
717
0
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
718
0
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
719
720
        // get prediction signal
721
0
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
722
723
0
        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
724
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
725
726
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
727
0
        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
728
729
0
        uint32_t sizeIdx   = log2TrSize - 2;
730
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
731
732
0
        PicYuv*  reconPic = m_frame->m_reconPic[0];
733
0
        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
734
0
        intptr_t picStride = reconPic->m_stride;
735
736
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
737
0
        if (numSig)
738
0
        {
739
0
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
740
0
            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
741
0
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
742
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
743
0
            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
744
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
745
0
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
746
0
        }
747
0
        else
748
0
        {
749
0
            primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
750
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
751
0
        }
752
0
    }
753
0
    else
754
0
    {
755
0
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
756
757
        /* code split block */
758
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
759
0
        uint32_t cbf = 0;
760
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
761
0
        {
762
0
            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
763
0
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
764
0
        }
765
0
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
766
0
    }
767
0
}
768
769
/* Walk the luma TU quadtree recorded in cu.m_tuDepth and, for every leaf TU, copy its
 * coefficients from the per-layer RQT storage into cu.m_trCoeff[0] and its
 * reconstruction into reconYuv. */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        /* not a leaf at this depth: descend into the four quadrants */
        const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t qPartIdx = absPartIdx;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, qPartIdx);
            qPartIdx += qNumParts;
        }
        return;
    }

    const uint32_t qtLayer      = log2TrSize - 2;
    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);

    // copy transform coefficients
    memcpy(cu.m_trCoeff[0] + coeffOffsetY,
           m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY,
           sizeof(coeff_t) << (log2TrSize * 2));

    // copy reconstruction
    m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
793
794
/* Shift both sub-TU CBF values up by one bit and place the OR of the two original
 * values in bit 0 of each. Results are truncated to 8 bits. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t mergedCbf = subTUCBF[0] | subTUCBF[1];

    for (int half = 0; half < 2; half++)
        subTUCBF[half] = static_cast<uint8_t>((subTUCBF[half] << 1) | mergedCbf);
}
800
801
/* 4:2:2 post-TU split processing */
802
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
803
0
{
804
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
805
806
0
    if (log2TrSize == 2)
807
0
    {
808
0
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
809
0
        ++log2TrSize;
810
0
    }
811
812
0
    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
813
814
    // move the CBFs down a level and set the parent CBF
815
0
    uint8_t subTUCBF[2];
816
0
    subTUCBF[0] = cu.getCbf(absPartIdx            , ttype, tuDepth);
817
0
    subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
818
0
    offsetCBFs(subTUCBF);
819
820
0
    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx             , tuNumParts);
821
0
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
822
0
}
823
824
/* returns distortion */
825
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
826
0
{
827
0
    CUData& cu = mode.cu;
828
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
829
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
830
831
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
832
0
    {
833
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
834
0
        uint32_t splitCbfU = 0, splitCbfV = 0;
835
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
836
0
        {
837
0
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
838
0
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
839
0
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
840
0
        }
841
0
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
842
0
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
843
844
0
        return;
845
0
    }
846
847
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
848
0
    uint32_t tuDepthC = tuDepth;
849
0
    if (log2TrSizeC < 2)
850
0
    {
851
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
852
0
        if (absPartIdx & 3)
853
0
            return;
854
0
        log2TrSizeC = 2;
855
0
        tuDepthC--;
856
0
    }
857
858
0
    if (bEnableRDOQ)
859
0
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
860
861
0
    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
862
0
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
863
0
    if (checkTransformSkip)
864
0
    {
865
0
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
866
0
        return;
867
0
    }
868
869
0
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
870
0
    uint32_t qtLayer = log2TrSize - 2;
871
0
    uint32_t stride = mode.fencYuv->m_csize;
872
0
    const uint32_t sizeIdxC = log2TrSizeC - 2;
873
874
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
875
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
876
877
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
878
0
    do
879
0
    {
880
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
881
882
0
        IntraNeighbors intraNeighbors;
883
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
884
885
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
886
0
        {
887
0
            TextType ttype = (TextType)chromaId;
888
889
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
890
0
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
891
0
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
892
0
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
893
0
            coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
894
0
            pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
895
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
896
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
897
0
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
898
0
            intptr_t picStride = reconPic->m_strideC;
899
900
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
901
0
            if (chromaPredMode == DM_CHROMA_IDX)
902
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
903
0
            if (m_csp == X265_CSP_I422)
904
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
905
906
            // init availability pattern
907
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
908
909
            // get prediction signal
910
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
911
0
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
912
913
0
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
914
915
0
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
916
0
            if (numSig)
917
0
            {
918
0
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
919
0
                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
920
0
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
921
0
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
922
0
                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
923
0
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
924
0
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
925
0
            }
926
0
            else
927
0
            {
928
                // no coded residual, recon = pred
929
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
930
0
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
931
0
            }
932
933
0
            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
934
935
0
            if (m_rdCost.m_psyRd)
936
0
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
937
0
            else if(m_rdCost.m_ssimRd)
938
0
                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
939
940
0
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
941
0
        }
942
0
    }
943
0
    while (tuIterator.isNextSection());
944
945
0
    if (splitType == VERTICAL_SPLIT)
946
0
    {
947
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
948
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
949
0
    }
950
0
}
951
952
/* returns distortion */
953
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
954
0
{
955
0
    CUData& cu = mode.cu;
956
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
957
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
958
0
    const uint32_t log2TrSizeC = 2;
959
0
    uint32_t qtLayer = log2TrSize - 2;
960
961
    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
962
     * so the entropy coder is not very accurate. The best we can do is return it in the same
963
     * condition as it arrived, and to do all bit estimates from the same state. */
964
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
965
966
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
967
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
968
969
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
970
0
    do
971
0
    {
972
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
973
974
0
        IntraNeighbors intraNeighbors;
975
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
976
977
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
978
0
        {
979
0
            TextType ttype = (TextType)chromaId;
980
981
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
982
0
            pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
983
0
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
984
0
            uint32_t stride = mode.fencYuv->m_csize;
985
0
            const uint32_t sizeIdxC = log2TrSizeC - 2;
986
987
0
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
988
0
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
989
0
            pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
990
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
991
992
            // init availability pattern
993
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
994
995
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
996
0
            if (chromaPredMode == DM_CHROMA_IDX)
997
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
998
0
            if (m_csp == X265_CSP_I422)
999
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1000
1001
            // get prediction signal
1002
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1003
1004
0
            uint64_t bCost = MAX_INT64;
1005
0
            sse_t bDist = 0;
1006
0
            uint32_t bCbf = 0;
1007
0
            uint32_t bEnergy = 0;
1008
0
            int      bTSkip = 0;
1009
1010
0
            int checkTransformSkip = 1;
1011
0
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
1012
0
            {
1013
0
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
1014
0
                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
1015
0
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
1016
1017
0
                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1018
1019
0
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
1020
0
                if (numSig)
1021
0
                {
1022
0
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
1023
0
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
1024
0
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1025
0
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1026
0
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
1027
0
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
1028
0
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1029
0
                }
1030
0
                else if (useTSkip)
1031
0
                {
1032
0
                    checkTransformSkip = 0;
1033
0
                    break;
1034
0
                }
1035
0
                else
1036
0
                {
1037
0
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
1038
0
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1039
0
                }
1040
0
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
1041
0
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
1042
1043
0
                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1044
1045
0
                uint32_t tmpBits = 0, tmpEnergy = 0;
1046
0
                if (numSig)
1047
0
                {
1048
0
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1049
0
                    m_entropyCoder.resetBits();
1050
0
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1051
0
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
1052
0
                }
1053
1054
0
                uint64_t tmpCost;
1055
0
                if (m_rdCost.m_psyRd)
1056
0
                {
1057
0
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1058
0
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
1059
0
                }
1060
0
                else if(m_rdCost.m_ssimRd)
1061
0
                {
1062
0
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1063
0
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
1064
0
                }
1065
0
                else
1066
0
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
1067
1068
0
                if (tmpCost < bCost)
1069
0
                {
1070
0
                    bCost = tmpCost;
1071
0
                    bDist = tmpDist;
1072
0
                    bTSkip = useTSkip;
1073
0
                    bCbf = !!numSig;
1074
0
                    bEnergy = tmpEnergy;
1075
0
                }
1076
0
            }
1077
1078
0
            if (bTSkip)
1079
0
            {
1080
0
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
1081
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
1082
0
            }
1083
1084
0
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1085
0
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1086
1087
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1088
0
            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1089
0
            intptr_t picStride = reconPic->m_strideC;
1090
0
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
1091
1092
0
            outCost.distortion += bDist;
1093
0
            outCost.energy += bEnergy;
1094
0
        }
1095
0
    }
1096
0
    while (tuIterator.isNextSection());
1097
1098
0
    if (splitType == VERTICAL_SPLIT)
1099
0
    {
1100
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1101
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1102
0
    }
1103
1104
0
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1105
0
}
1106
1107
/* Walk the TU quadtree and, at each chroma leaf (the luma TU depth is reached, or the
 * chroma TU size has hit log2 == 2), copy the U and V coefficients from the RQT layer
 * storage into cu.m_trCoeff[1..2] and the chroma reconstruction into reconYuv. */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t tuDepthL    = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize  = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (tuDepthL != tuDepth && log2TrSizeC != 2)
    {
        /* not a chroma leaf yet: descend into the four quadrants */
        const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t qPartIdx = absPartIdx;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            extractIntraResultChromaQT(cu, reconYuv, qPartIdx, tuDepth + 1);
            qPartIdx += qNumParts;
        }
        return;
    }

    // copy transform coefficients (4:2:2 carries twice as many per TU position)
    const uint32_t numCoeffC    = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    const uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
    const uint32_t qtLayer      = log2TrSize - 2 - (tuDepthL - tuDepth);
    const size_t   numBytes     = sizeof(coeff_t) * numCoeffC;

    for (int plane = 1; plane <= 2; plane++)
        memcpy(cu.m_trCoeff[plane] + coeffOffsetC, m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC, numBytes);

    // copy reconstruction
    m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
}
1137
1138
/* Chroma intra residual generation without cost measurement: for every chroma TU under
 * (absPartIdx, tuDepth), following cu.m_tuDepth, predict, transform and quantize into
 * cu.m_trCoeff and write the reconstruction directly into the frame's recon picture. */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* luma TU is split further: recurse and merge the children's chroma CBFs */
        const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t childCbfU = 0;
        uint32_t childCbfV = 0;
        uint32_t qPartIdx = absPartIdx;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
            childCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            childCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            qPartIdx += qNumParts;
        }
        cu.m_cbf[1][absPartIdx] |= (childCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (childCbfV << tuDepth);
        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        /* chroma is clamped to 4x4 and coded one depth higher; only the partition whose
         * low two index bits are zero does the work */
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    ShortYuv&      resiYuv   = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t stride    = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC  = log2TrSizeC - 2;
    PicYuv*        reconPic  = m_frame->m_reconPic[0];
    const intptr_t picStride = reconPic->m_strideC;

    const uint32_t  curPartNum = cuGeom.numPartitions >> (tuDepthC * 2);
    const SplitType splitType  = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
        const uint32_t picPartIdx  = cuGeom.absPartIdx + absPartIdxC;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const TextType ttype = (TextType)chromaId;

            const pixel* fenc      = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel*       pred      = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t*     residual  = resiYuv.getChromaAddr(chromaId, absPartIdxC);
            coeff_t*     coeffC    = cu.m_trCoeff[ttype] +
                                     (absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)));
            pixel*       picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, picPartIdx);

            /* resolve the chroma prediction mode (derived mode reads the luma direction) */
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

            const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (!numSig)
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);

                /* aligned add_ps only when every offset and stride is a multiple of 64 */
                const bool bPicAligned  = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[picPartIdx]) % 64 == 0;
                const bool bPredAligned = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                const bool bResiAligned = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                const bool bAllAligned  = bPicAligned && bPredAligned && bResiAligned && (picStride % 64 == 0) && (stride % 64 == 0);

                primitives.cu[sizeIdxC].add_ps[bAllAligned](picReconC, picStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
1241
1242
/* Evaluate intraMode as an intra CU with the given partition size: estimate luma (and,
 * unless the color space is 4:0:0, chroma) prediction and distortion, count the bits of
 * the CU syntax with the entropy coder, fill in the mode's bit/energy statistics and
 * update its cost. */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    /* distortion */
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
    if (m_csp == X265_CSP_I400)
        intraMode.distortion += intraMode.lumaDistortion;
    else
    {
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
    }
    cu.m_distortion[0] = intraMode.distortion;

    /* bits: CU header first, so the header bits can be separated from the coefficient bits */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    int skipFlagBits = 0;
    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    /* luma energy terms over the whole CU */
    const Yuv* fencYuv  = intraMode.fencYuv;
    Yuv&       reconYuv = intraMode.reconYuv;
    Yuv&       predYuv  = intraMode.predYuv;

    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv.m_buf[0], reconYuv.m_size);
    else if (m_rdCost.m_ssimRd)
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv.m_buf[0], reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);

    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv.m_buf[0], predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);

#if ENABLE_SCC_EXT
    if (m_param->bEnableSCC)
        reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
#endif
}
1299
1300
/* Note that this function does not save the best intra prediction, it must
1301
 * be generated later. It records the best mode in the cu */
1302
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1303
0
{
1304
0
    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1305
1306
0
    CUData& cu = intraMode.cu;
1307
0
    uint32_t depth = cuGeom.depth;
1308
1309
0
    cu.setPartSizeSubParts(SIZE_2Nx2N);
1310
0
    cu.setPredModeSubParts(MODE_INTRA);
1311
1312
0
    const uint32_t initTuDepth = 0;
1313
0
    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
1314
0
    uint32_t tuSize = 1 << log2TrSize;
1315
0
    const uint32_t absPartIdx = 0;
1316
1317
    // Reference sample smoothing
1318
0
    IntraNeighbors intraNeighbors;
1319
0
    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1320
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1321
1322
0
    const pixel* fenc = intraMode.fencYuv->m_buf[0];
1323
0
    uint32_t stride = intraMode.fencYuv->m_size;
1324
1325
0
    int sad, bsad;
1326
0
    uint32_t bits, bbits, mode, bmode;
1327
0
    uint64_t cost, bcost;
1328
1329
    // 33 Angle modes once
1330
0
    int scaleTuSize = tuSize;
1331
0
    int scaleStride = stride;
1332
0
    int costShift = 0;
1333
0
    int sizeIdx = log2TrSize - 2;
1334
1335
0
    if (tuSize > 32)
1336
0
    {
1337
        // CU is 64x64, we scale to 32x32 and adjust required parameters
1338
0
        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
1339
0
        fenc = m_fencScaled;
1340
1341
0
        pixel nScale[129];
1342
0
        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
1343
0
        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
1344
1345
        // we do not estimate filtering for downscaled samples
1346
0
        memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
1347
0
        memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
1348
1349
0
        scaleTuSize = 32;
1350
0
        scaleStride = 32;
1351
0
        costShift = 2;
1352
0
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1353
0
    }
1354
1355
0
    pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1356
0
    int predsize = scaleTuSize * scaleTuSize;
1357
1358
0
    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1359
1360
    /* there are three cost tiers for intra modes:
1361
     *  pred[0]          - mode probable, least cost
1362
     *  pred[1], pred[2] - less probable, slightly more cost
1363
     *  non-mpm modes    - all cost the same (rbits) */
1364
0
    uint64_t mpms;
1365
0
    uint32_t mpmModes[3];
1366
0
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1367
1368
    // DC
1369
0
    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1370
0
    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1371
0
    bmode = mode = DC_IDX;
1372
0
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1373
0
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1374
1375
    // PLANAR
1376
0
    pixel* planar = intraNeighbourBuf[0];
1377
0
    if (tuSize & (8 | 16 | 32))
1378
0
        planar = intraNeighbourBuf[1];
1379
1380
0
    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
1381
0
    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1382
0
    mode = PLANAR_IDX;
1383
0
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1384
0
    cost = m_rdCost.calcRdSADCost(sad, bits);
1385
0
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1386
1387
0
    bool allangs = true;
1388
0
    if (primitives.cu[sizeIdx].intra_pred_allangs)
1389
0
    {
1390
0
        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1391
0
        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
1392
0
    }
1393
0
    else
1394
0
        allangs = false;
1395
1396
0
#define TRY_ANGLE(angle) \
1397
0
    if (allangs) { \
1398
0
        if (angle < 18) \
1399
0
            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1400
0
        else \
1401
0
            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1402
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1403
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1404
0
    } else { \
1405
0
        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
1406
0
        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
1407
0
        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
1408
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1409
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1410
0
    }
1411
1412
0
    if (m_param->bEnableFastIntra)
1413
0
    {
1414
0
        int asad = 0;
1415
0
        uint32_t lowmode, highmode, amode = 5, abits = 0;
1416
0
        uint64_t acost = MAX_INT64;
1417
1418
        /* pick the best angle, sampling at distance of 5 */
1419
0
        for (mode = 5; mode < 35; mode += 5)
1420
0
        {
1421
0
            TRY_ANGLE(mode);
1422
0
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1423
0
        }
1424
1425
        /* refine best angle at distance 2, then distance 1 */
1426
0
        for (uint32_t dist = 2; dist >= 1; dist--)
1427
0
        {
1428
0
            lowmode = amode - dist;
1429
0
            highmode = amode + dist;
1430
1431
0
            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1432
0
            TRY_ANGLE(lowmode);
1433
0
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1434
1435
0
            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1436
0
            TRY_ANGLE(highmode);
1437
0
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1438
0
        }
1439
1440
0
        if (amode == 33)
1441
0
        {
1442
0
            TRY_ANGLE(34);
1443
0
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1444
0
        }
1445
1446
0
        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1447
0
    }
1448
0
    else // calculate and search all intra prediction angles for lowest cost
1449
0
    {
1450
0
        for (mode = 2; mode < 35; mode++)
1451
0
        {
1452
0
            TRY_ANGLE(mode);
1453
0
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1454
0
        }
1455
0
    }
1456
1457
0
    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
1458
0
    intraMode.initCosts();
1459
0
    intraMode.totalBits = bbits;
1460
0
    intraMode.distortion = bsad;
1461
0
    intraMode.sa8dCost = bcost;
1462
0
    intraMode.sa8dBits = bbits;
1463
0
}
1464
1465
/* Full encode of a 2Nx2N intra CU in a non-intra slice: codes the luma
 * residual quad-tree (and chroma unless the color space is I400), counts the
 * CU level syntax bits and fills in the distortion, bit, energy and context
 * fields of intraMode before updating its mode cost. */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv& recon = intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    /* luma first, then (optionally) chroma */
    Cost lumaCosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, lumaCosts, tuDepthRange);
    extractIntraResultQT(cu, recon, 0, 0);

    intraMode.lumaDistortion = lumaCosts.distortion;
    if (m_csp == X265_CSP_I400)
        intraMode.distortion = intraMode.lumaDistortion;
    else
    {
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
    }

    /* count the bits of the CU syntax from a clean bit counter */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    const int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();

    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    /* everything written after the skip flag and before the coefficients */
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    const Yuv* fencYuv = intraMode.fencYuv;
    const pixel* fencLuma = fencYuv->m_buf[0];

    /* psy-rd takes precedence over ssim-rd when both are enabled */
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencLuma, fencYuv->m_size, recon.m_buf[0], recon.m_size);
    else if (m_rdCost.m_ssimRd)
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencLuma, fencYuv->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);

    /* luma SSE between the source and the prediction */
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencLuma, fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
1519
1520
/* Choose and code the luma intra direction of every PU in the CU (a single PU
 * for SIZE_2Nx2N, four otherwise) and return the summed luma distortion.
 *
 * For a PU whose direction is still ALL_IDX the search runs in three steps:
 *   1. sa8d cost of all 35 modes,
 *   2. simple RDO (no TU splits) of the best few candidates,
 *   3. final measurement of the winner with TU splits allowed.
 * A PU that already carries a direction goes straight to step 3. */
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{
    CUData& cu = intraMode.cu;
    Yuv& recon = intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    const uint32_t depth       = cuGeom.depth;
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
    const uint32_t numPU       = 1 << (2 * initTuDepth);
    const uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    const uint32_t tuSize      = 1 << log2TrSize;
    const uint32_t qNumParts   = cuGeom.numPartitions >> 2;
    const uint32_t sizeIdx     = log2TrSize - 2;
    sse_t totalDistortion      = 0;

    /* transform skip is only evaluated for non-2Nx2N partitions */
    const int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    // loop over partitions
    uint32_t absPartIdx = 0;
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
        {
            /* direction was decided earlier, nothing to search */
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
        }
        else
        {
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
            const int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);

            {
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                // Reference sample preparation (including smoothing)
                IntraNeighbors intraNeighbors;
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

                // determine set of modes to be tested (using prediction signal only)
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                const uint32_t stride = predYuv->m_size;

                const int scaleTuSize = tuSize;
                const int scaleStride = stride;
                const int costShift = 0;
                const int predArea = scaleTuSize * scaleTuSize;

                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

                /* intra mode signalling has three cost tiers:
                 *  mpmModes[0]              - most probable, cheapest
                 *  mpmModes[1], mpmModes[2] - less probable, slightly more expensive
                 *  everything else          - one common cost (rbits) */
                uint64_t mpms;
                uint32_t mpmModes[3];
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);

                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
                uint64_t modeCosts[35];

                // DC
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                bcost = m_rdCost.calcRdSADCost(sad, bits);
                modeCosts[DC_IDX] = bcost;

                // PLANAR reads the second neighbour buffer for 8x8 .. 32x32 blocks
                pixel* planar = (tuSize >= 8 && tuSize <= 32) ? intraNeighbourBuf[1] : intraNeighbourBuf[0];

                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                /* angular predictions: generated all at once when the primitive
                 * exists (modes below 18 are then matched against the
                 * transposed source), otherwise one mode at a time */
                const bool haveAllAngs = primitives.cu[sizeIdx].intra_pred_allangs ? true : false;
                if (haveAllAngs)
                {
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                }

                for (int mode = 2; mode < 35; mode++)
                {
                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;

                    if (haveAllAngs)
                    {
                        const pixel* angPred = &m_intraPredAngs[(mode - 2) * predArea];
                        if (mode < 18)
                            sad = sa8d(m_fencTransposed, scaleTuSize, angPred, scaleTuSize) << costShift;
                        else
                            sad = sa8d(fenc, scaleStride, angPred, scaleTuSize) << costShift;
                    }
                    else
                    {
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                    }

                    modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                    COPY1_IF_LT(bcost, modeCosts[mode]);
                }

                /* Keep at most maxCandCount modes for RD analysis: those whose cost is
                 * below 1.25x the best cost, plus mpmModes[0] regardless of its cost.
                 * maxCandCount grows with rdLevel and with depth, so more modes are
                 * tried at slower RD levels and at higher depths */
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;

                const uint64_t paddedBcost = bcost + (bcost >> 2); // bcost * 1.25
                for (int mode = 0; mode < 35; mode++)
                {
                    const bool bWithinThreshold = modeCosts[mode] < paddedBcost;
                    const bool bFirstMpm = (uint32_t)mode == mpmModes[0];
                    if (bWithinThreshold || bFirstMpm)
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
                }
            }

            /* measure best candidates using simple RDO (no TU splits); unused
             * slots of the candidate list still hold MAX_INT64 */
            bcost = MAX_INT64;
            for (int i = 0; i < maxCandCount && candCostList[i] != MAX_INT64; i++)
            {
                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                m_entropyCoder.load(m_rqt[depth].cur);
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost candCosts;
                if (checkTransformSkip)
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, candCosts);
                else
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, candCosts, depthRange);
                COPY2_IF_LT(bcost, candCosts.rdcost, bmode, rdModeList[i]);
            }
        }

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

        /* remeasure best mode, allowing TU splits */
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
        m_entropyCoder.load(m_rqt[depth].cur);

        Cost icosts;
        if (checkTransformSkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
        else
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
        totalDistortion += icosts.distortion;

        extractIntraResultQT(cu, recon, initTuDepth, absPartIdx);

        // set reconstruction for next intra prediction blocks
        const bool bLastPU = (puIdx == numPU - 1);
        if (!bLastPU)
        {
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
             * that the contexts should be tracked through each PU */
            PicYuv* reconPic = m_frame->m_reconPic[0];
            pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            const pixel* src = recon.getLumaAddr(absPartIdx);
            const uint32_t dststride = reconPic->m_stride;
            const uint32_t srcstride = recon.m_size;
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
        }
    }

    if (numPU > 1)
    {
        /* merge the depth-1 luma cbf of the four quadrants into the first entry */
        uint32_t combCbfY = 0;
        uint32_t qPartIdx = 0;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
            qPartIdx += qNumParts;
        }

        cu.m_cbf[0][0] |= combCbfY;
    }

    // TODO: remove this
    m_entropyCoder.load(m_rqt[depth].cur);

    return totalDistortion;
}
1708
1709
/* Select the chroma intra direction of the CU using prediction-only sa8d
 * costs summed over the U and V planes, and store the winner in the CU via
 * setChromIntraDirSubParts().  No residual is coded here.
 *
 * Chroma blocks larger than 32 are handled with tuDepth 1, a 32x32 cost
 * primitive (log2TrSizeC = 5) and a cost shift of 2. */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    uint32_t bestMode  = 0;
    uint64_t bestCost  = MAX_INT64;
    uint32_t modeList[NUM_CHROMA_MODE];

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    if (tuSize > 32)
    {
        tuDepth = 1;
        costShift = 2;
        log2TrSizeC = 5;
    }

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
    cu.getAllowedChromaDir(0, modeList);

    // check chroma modes
    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
    {
        /* resolve the list entry to the direction actually predicted:
         * DM_CHROMA_IDX follows the luma direction, and 4:2:2 remaps the
         * angle through its mapping table */
        uint32_t chromaPredMode = modeList[mode];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[0];
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        uint64_t cost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];
            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);

            /* get prediction signal; each buffer is addressed with the chroma
             * stride of the Yuv that owns it (pred lives in predYuv, fenc in
             * fencYuv) */
            predIntraChromaAng(chromaPredMode, pred, predYuv->m_csize, log2TrSizeC);
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, fencYuv->m_csize, pred, predYuv->m_csize) << costShift;
        }

        if (cost < bestCost)
        {
            bestCost = cost;
            /* remember the list entry, not the remapped direction */
            bestMode = modeList[mode];
        }
    }

    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
1764
1765
/* RD search of the chroma intra direction.  Each section visited by the TU
 * iterator (four sections when the CU is not 2Nx2N in 4:4:4, otherwise one)
 * codes every allowed chroma mode, keeps the cheapest one, and restores its
 * reconstruction, cbf and transform-skip flags.  Returns the summed chroma
 * distortion of the chosen modes. */
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv& reconYuv = intraMode.reconYuv;

    const uint32_t depth       = cuGeom.depth;
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
    const uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    const uint32_t absPartStep = cuGeom.numPartitions;
    sse_t totalDistortion = 0;

    const int size = partitionFromLog2Size(log2TrSize);

    TURecurse tuIterator(initTuDepth ? QUAD_SPLIT : DONT_SPLIT, absPartStep, 0);

    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
        const size_t sectionBytes = tuIterator.absPartIdxStep * sizeof(uint8_t);

        uint32_t bestMode = 0;
        sse_t bestDist = 0;
        uint64_t bestCost = MAX_INT64;

        // init mode list
        const uint32_t minMode = 0;
        uint32_t maxMode = NUM_CHROMA_MODE;
        uint32_t modeList[NUM_CHROMA_MODE];

        const bool bModePreset = intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth;
        if (!bModePreset)
            cu.getAllowedChromaDir(absPartIdxC, modeList);
        else
        {
            /* a direction was chosen earlier: evaluate only that one */
            for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
            maxMode = 1;
        }

        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
        {
            /* source picture is I400 while the encode is not: only the
             * first mode of the list is evaluated */
            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
                modeList[l] = modeList[0];
            maxMode = 1;
        }

        // check chroma modes
        for (uint32_t mode = minMode; mode < maxMode; mode++)
        {
            // restore context models
            m_entropyCoder.load(m_rqt[depth].cur);

            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
            Cost outCost;
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);

            if (m_slice->m_pps->bTransformSkipEnabled)
                m_entropyCoder.load(m_rqt[depth].cur);

            m_entropyCoder.resetBits();

            /* the chroma prediction mode is coded at partition 0, or - for
             * non-2Nx2N CUs in 4:4:4 - at the start of every quadrant */
            bool bCodeChromaDir;
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
                bCodeChromaDir = !absPartIdxC;
            else
            {
                const uint32_t qNumParts = cuGeom.numPartitions >> 2;
                bCodeChromaDir = !(absPartIdxC & (qNumParts - 1));
            }
            if (bCodeChromaDir)
                m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);

            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
            const uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();

            /* psy-rd takes precedence over ssim-rd, plain RD cost otherwise */
            uint64_t cost;
            if (m_rdCost.m_psyRd)
                cost = m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy);
            else if (m_rdCost.m_ssimRd)
                cost = m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy);
            else
                cost = m_rdCost.calcRdCost(outCost.distortion, bits);

            if (cost < bestCost)
            {
                bestCost = cost;
                bestDist = outCost.distortion;
                bestMode = modeList[mode];

                /* snapshot everything a later mode would overwrite */
                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, sectionBytes);
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, sectionBytes);
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, sectionBytes);
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, sectionBytes);
            }
        }

        if (!tuIterator.isLastSection())
        {
            /* copy the best Cb/Cr reconstruction of this section into the
             * recon picture */
            const uint32_t zorder = cuGeom.absPartIdx + absPartIdxC;
            PicYuv* reconPic = m_frame->m_reconPic[0];
            const uint32_t dststride = reconPic->m_strideC;

            pixel* dstCb = reconPic->getCbAddr(cu.m_cuAddr, zorder);
            const pixel* srcCb = reconYuv.getCbAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dstCb, dststride, srcCb, reconYuv.m_csize);

            pixel* dstCr = reconPic->getCrAddr(cu.m_cuAddr, zorder);
            const pixel* srcCr = reconYuv.getCrAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dstCr, dststride, srcCr, reconYuv.m_csize);
        }

        /* put the winner's flags and direction back into the CU */
        memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], sectionBytes);
        memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], sectionBytes);
        memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], sectionBytes);
        memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], sectionBytes);
        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
        totalDistortion += bestDist;
    }
    while (tuIterator.isNextSection());

    if (initTuDepth != 0)
    {
        /* merge the depth-1 chroma cbfs of the four quadrants into the first entry */
        uint32_t combCbfU = 0;
        uint32_t combCbfV = 0;
        const uint32_t qNumParts = tuIterator.absPartIdxStep;
        uint32_t qPartIdx = 0;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
            qPartIdx += qNumParts;
        }

        cu.m_cbf[1][0] |= combCbfU;
        cu.m_cbf[2][0] |= combCbfV;
    }

    /* TODO: remove this */
    m_entropyCoder.load(m_rqt[depth].cur);
    return totalDistortion;
}
1900
1901
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
1902
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
1903
0
{
1904
0
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
1905
1906
0
    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
1907
0
    uint8_t  candDir[MRG_MAX_NUM_CANDS];
1908
0
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
1909
#if ENABLE_SCC_EXT
1910
    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
1911
#else
1912
0
    if (cu.isBipredRestriction())
1913
0
    {
1914
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
1915
0
        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
1916
0
        {
1917
0
            if (candDir[mergeCand] == 3)
1918
0
            {
1919
0
                candDir[mergeCand] = 1;
1920
0
                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
1921
0
            }
1922
0
        }
1923
0
    }
1924
0
#endif
1925
1926
0
    Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1927
1928
0
    uint32_t outCost = MAX_UINT;
1929
0
    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
1930
0
    {
1931
        /* Prevent TMVP candidates from using unavailable reference pixels */
1932
0
        if (m_bFrameParallel)
1933
0
        {
1934
            // Parallel slices bound check
1935
0
            if (m_param->maxSlices > 1)
1936
0
            {
1937
0
                if (cu.m_bFirstRowInSlice &
1938
0
                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
1939
0
                    continue;
1940
1941
                // Last row in slice can't reference beyond bound since it is another slice area
1942
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
1943
0
                if (cu.m_bLastRowInSlice &&
1944
0
                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
1945
0
                    continue;
1946
0
            }
1947
1948
0
            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1949
0
                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
1950
0
                continue;
1951
0
        }
1952
1953
#if ENABLE_SCC_EXT
1954
        if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc))
1955
        {
1956
            continue;
1957
        }
1958
#endif
1959
0
        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
1960
0
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
1961
0
        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
1962
0
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
1963
1964
0
        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
1965
1966
0
        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
1967
0
        if (m_me.bChromaSATD)
1968
0
            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
1969
1970
0
        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
1971
0
        costCand = costCand + m_rdCost.getCost(bitsCand);
1972
0
        if (costCand < outCost)
1973
0
        {
1974
0
            outCost = costCand;
1975
0
            m.bits = bitsCand;
1976
0
            m.index = mergeCand;
1977
0
        }
1978
0
    }
1979
1980
0
    m.mvField[0] = candMvField[m.index][0];
1981
0
    m.mvField[1] = candMvField[m.index][1];
1982
0
    m.dir = candDir[m.index];
1983
1984
0
    return outCost;
1985
0
}
1986
1987
/* find the lowres motion vector from lookahead in middle of current PU */
1988
/* Looks up the lookahead motion vector stored for the block that contains the
 * centre of the given PU, for reference `ref` of reference list `list`.
 *
 * Returns 0 (converted to MV) when the POC distance to the reference is larger
 * than bframes + 1, or when the x component of the first stored vector equals
 * 0x7FFF. Otherwise returns the stored vector shifted left by one.
 * Callers in this file test the result with notZero() before using it. */
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
1989
0
{
1990
0
    /* absolute POC distance between the current slice and the reference; it
     * is used below as the second index into lowresMvs */
    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
1991
0
    if (diffPoc > m_param->bframes + 1)
1992
        /* poc difference is out of range for lookahead */
1993
0
        return 0;
1994
1995
0
    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc];
1996
0
    /* 0x7FFF in the first entry is treated as a "no data" marker */
    if (mvs[0].x == 0x7FFF)
1997
        /* this motion search was not estimated by lookahead */
1998
0
        return 0;
1999
2000
0
    /* Centre of the PU (CU position + PU offset looked up by puAbsPartIdx +
     * half the PU size), shifted right by 4, i.e. divided by 16.
     * NOTE(review): presumably one stored vector per 16x16 full-resolution
     * block - confirm against the lookahead code. */
    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
2001
0
    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
2002
0
    /* row-major index into the per-frame vector array */
    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
2003
2004
0
    /* sanity checks on the block coordinates that were used to build idx */
    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
2005
0
    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
2006
2007
0
    return mvs[idx] << 1; /* scale up lowres mv */
2008
0
}
2009
2010
/* Pick between the two AMVP candidates which is the best one to use as
2011
 * MVP for the motion search, based on SAD cost */
2012
/* Parameters:
 *   cu, pu    - coding unit and prediction unit being searched
 *   amvp      - the AMVP candidates to choose between
 *   list, ref - reference list and reference index used to fetch the
 *               reference picture for the trial prediction
 *
 * Returns 0 or 1, the index of the candidate with the lower SAD; ties and
 * identical candidates yield 0. The final comparison reads costs[0] and
 * costs[1] directly, so it assumes AMVP_NUM_CANDS is 2 - TODO confirm. */
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
2013
0
{
2014
0
    /* identical candidates: nothing to choose, skip the trial predictions */
    if (amvp[0] == amvp[1])
2015
0
        return 0;
2016
2017
0
    /* scratch prediction buffer selected by the depth of the CU */
    Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
2018
0
    uint32_t costs[AMVP_NUM_CANDS];
2019
2020
0
    for (int i = 0; i < AMVP_NUM_CANDS; i++)
2021
0
    {
2022
0
        MV mvCand = amvp[i];
2023
2024
        // NOTE: skip mvCand if Y is > merange and -FN>1
2025
0
        if (m_bFrameParallel)
2026
0
        {
2027
0
            /* pre-set the worst cost so that a candidate skipped by one of
             * the `continue` statements below still has a defined cost */
            costs[i] = m_me.COST_MAX;
2028
2029
0
            /* NOTE(review): the factor 4 presumably converts searchRange
             * from pixels to quarter-pel MV units - confirm */
            if (mvCand.y >= (m_param->searchRange + 1) * 4)
2030
0
                continue;
2031
2032
0
            /* with more than one slice, reject candidates whose vertical
             * component lies outside [m_sliceMinY, m_sliceMaxY]; the bitwise
             * & and | evaluate every comparison (no short-circuit) */
            if ((m_param->maxSlices > 1) &
2033
0
                ((mvCand.y < m_sliceMinY)
2034
0
              |  (mvCand.y > m_sliceMaxY)))
2035
0
                continue;
2036
0
        }
2037
0
        cu.clipMv(mvCand);
2038
#if ENABLE_SCC_EXT
2039
        /* SCC build: for list 0 and the last list-0 reference index, predict
         * from m_reconPic[1] of the reference frame instead */
        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1)
2040
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand);
2041
        else
2042
#endif
2043
0
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
2044
0
        /* cost of this candidate = SAD reported by m_me for the luma prediction */
        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
2045
0
    }
2046
2047
0
    return (costs[0] <= costs[1]) ? 0 : 1;
2048
0
}
2049
2050
/* Task entry point of a PME job group. Forwards to master.processPME(),
 * passing this PME object and the `analysis` member of the m_tld entry
 * selected by workerThreadId as the Search instance that does the work.
 * When DETAILED_CU_STATS is enabled, the task is also counted and timed in
 * the master's statistics for the current frame encoder. */
void Search::PME::processTasks(int workerThreadId)
2051
0
{
2052
#if DETAILED_CU_STATS
2053
    /* statistics are kept per frame encoder, indexed by m_frameEncoderID */
    int fe = mode.cu.m_encData->m_frameEncoderID;
2054
    master.m_stats[fe].countPMETasks++;
2055
    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
2056
#endif
2057
0
    ProfileScopeEvent(pme);
2058
0
    master.processPME(*this, master.m_tld[workerThreadId].analysis);
2059
0
}
2060
2061
/* Pulls motion-estimation jobs from `pme` and runs them on `slave` until no
 * job is left. `this` is the master Search that owns the CU; `slave` is the
 * Search instance that executes the searches and may be `this` itself.
 *
 * Job ids are handed out under pme.m_lock by incrementing pme.m_jobAcquired
 * while it is below pme.m_jobTotal. Ids below m_jobs.refCnt[0] select list 0
 * references; the remaining ids select list 1 references. Returns immediately
 * if no job is available on entry. */
void Search::processPME(PME& pme, Search& slave)
2062
0
{
2063
    /* acquire a motion estimation job, else exit early */
2064
0
    int meId;
2065
0
    pme.m_lock.acquire();
2066
0
    if (pme.m_jobTotal > pme.m_jobAcquired)
2067
0
    {
2068
0
        meId = pme.m_jobAcquired++;
2069
0
        pme.m_lock.release();
2070
0
    }
2071
0
    else
2072
0
    {
2073
0
        /* nothing left to do: release the lock before leaving */
        pme.m_lock.release();
2074
0
        return;
2075
0
    }
2076
2077
    /* Setup slave Search instance for ME for master's CU */
2078
0
    if (&slave != this)
2079
0
    {
2080
0
        /* copy the master's slice, frame and parameter pointers, then set
         * the slave's lambda from the master's QP and point its motion
         * estimator at the PU described by pme */
        slave.m_slice = m_slice;
2081
0
        slave.m_frame = m_frame;
2082
0
        slave.m_param = m_param;
2083
0
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
2084
0
        /* chroma flag is true unless the source picture is X265_CSP_I400 */
        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
2085
0
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
2086
0
    }
2087
2088
    /* Perform ME, repeat until no more work is available */
2089
0
    do
2090
0
    {
2091
0
        /* job ids [0, refCnt[0]) map to list 0; the rest map to list 1 */
        if (meId < pme.m_jobs.refCnt[0])
2092
0
        {
2093
0
            int refIdx = pme.m_jobs.ref[0][meId]; //L0
2094
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
2095
0
        }
2096
0
        else
2097
0
        {
2098
0
            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
2099
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
2100
0
        }
2101
2102
0
        /* try to take the next job; meId stays -1 (ending the loop) if none is left */
        meId = -1;
2103
0
        pme.m_lock.acquire();
2104
0
        if (pme.m_jobTotal > pme.m_jobAcquired)
2105
0
            meId = pme.m_jobAcquired++;
2106
0
        pme.m_lock.release();
2107
0
    }
2108
0
    while (meId >= 0);
2109
0
}
2110
2111
/* Runs one uni-directional motion search for prediction unit `pu` (partition
 * index `part`) against reference `ref` of list `list`, and records the result
 * in interMode.bestME[part][list] if it beats the entry stored there.
 *
 * `master` supplies the list-selection bit counts (m_listSelBits) and the lock
 * (m_meLock) under which the shared bestME entry is compared and updated. An
 * equal cost only replaces the stored entry when `ref` is smaller than the
 * stored reference index. */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
2112
0
{
2113
0
    /* side-information bits: list selection + MVP index + reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
2114
0
    int numIdx = m_slice->m_numRefIdx[list];
2115
#if ENABLE_SCC_EXT
2116
    /* with IBC enabled, one list-0 reference is excluded from the count */
    if (!list && m_ibcEnabled)
2117
        numIdx--;
2118
#endif
2119
0
    bits += getTUBits(ref, numIdx);
2120
2121
0
    MotionData* bestME = interMode.bestME[part];
2122
2123
    // 12 mv candidates including lowresMV
2124
0
    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2125
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2126
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx);
2127
#else
2128
0
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2129
0
#endif
2130
2131
0
    /* NOTE(review): getPMV presumably fills amvpCand[list][ref] and mvc and
     * returns the number of entries written to mvc - confirm in cudata */
    const MV* amvp = interMode.amvpCand[list][ref];
2132
0
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
2133
0
    bool bLowresMVP = false;
2134
0
    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2135
2136
0
    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging if lowresMV is not available */
2137
0
    {
2138
0
        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
2139
0
        /* layer is the view id when numViews > 1, else the scalable layer
         * id when numScalableLayers > 1, else 0 */
        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2140
0
        /* a non-zero lookahead vector is added as an extra search candidate,
         * but only for layer 0 */
        if (lmv.notZero() && !layer)
2141
0
            mvc[numMvc++] = lmv;
2142
0
        /* with HME enabled it is also kept as an alternative search predictor */
        if (m_param->bEnableHME)
2143
0
            mvp_lowres = lmv;
2144
0
    }
2145
2146
0
    /* true when the reference has the same POC as the current slice */
    m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc;
2147
0
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
2148
2149
0
    /* first search, using the selected AMVP candidate as the predictor */
    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2150
0
      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2151
2152
0
    /* optional second search using the lookahead vector as the predictor;
     * its result is kept only if it is strictly cheaper */
    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2153
0
    {
2154
0
        MV outmv_lowres;
2155
0
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2156
0
        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2157
0
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2158
0
        if (lowresMvCost < satdCost)
2159
0
        {
2160
0
            outmv = outmv_lowres;
2161
0
            satdCost = lowresMvCost;
2162
0
            bLowresMVP = true;
2163
0
        }
2164
0
    }
2165
    /* Get total cost of partition, but only include MV bit cost once */
2166
0
    bits += m_me.bitcost(outmv);
2167
0
    uint32_t mvCost = m_me.mvcost(outmv);
2168
0
    /* mvCost is subtracted from satdCost here because the MV bits are
     * already part of `bits`, which is costed by m_rdCost.getCost() */
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2169
2170
    /* Update LowresMVP to best AMVP cand*/
2171
0
    if (bLowresMVP)
2172
0
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2173
2174
    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2175
0
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2176
2177
    /* tie goes to the smallest ref ID, just like --no-pme */
2178
0
    /* NOTE(review): _lock presumably holds master.m_meLock until the function
     * returns, covering the compare-and-update of the shared bestME entry */
    ScopedLock _lock(master.m_meLock);
2179
0
    if (cost < bestME[list].cost ||
2180
0
       (cost == bestME[list].cost && ref < bestME[list].ref))
2181
0
    {
2182
0
        bestME[list].mv = outmv;
2183
0
        bestME[list].mvp = mvp;
2184
0
        bestME[list].mvpIdx = mvpIdx;
2185
0
        bestME[list].ref = ref;
2186
0
        bestME[list].cost = cost;
2187
0
        bestME[list].bits = bits;
2188
0
        bestME[list].mvCost  = mvCost;
2189
0
    }
2190
0
}
2191
/* Runs a motion search from each of the first m_param->mvRefine start vectors
 * in mvp[] against reference `ref` of list `list`, and writes the resulting
 * motion vector with the lowest cost to `outmv`.
 *
 * A start vector is skipped when it equals the previous one, or, for the third
 * entry, when it equals either of the first two. `outmv` is only written when
 * a search returns a cost below the running best, which starts at INT_MAX.
 * numMvc / mvc are passed through to motionEstimate() as extra candidates.
 * NOTE(review): mvp is declared with 3 entries, so mvRefine is presumably
 * never larger than 3 - confirm against parameter validation. */
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
2192
0
{
2193
0
    CUData& cu = interMode.cu;
2194
0
    MV mv, mvmin, mvmax;
2195
0
    int cand = 0, bestcost = INT_MAX;
2196
0
    while (cand < m_param->mvRefine)
2197
0
    {
2198
0
        /* duplicate start vector: a search from it was already done */
        if ((cand && mvp[cand] == mvp[cand - 1]) || (cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1])))
2199
0
        {
2200
0
            cand++;
2201
0
            continue;
2202
0
        }
2203
0
        MV bestMV;
2204
0
        /* work on a clipped copy so the caller's mvp[] is left unchanged */
        mv = mvp[cand++];
2205
0
        cu.clipMv(mv);
2206
0
        /* true when the reference has the same POC as the current slice */
        m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2207
0
        setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
2208
0
        int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices, m_vertRestriction,
2209
0
        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2210
0
        /* keep the cheapest result seen so far */
        if (bestcost > cost)
2211
0
        {
2212
0
            bestcost = cost;
2213
0
            outmv = bestMV;
2214
0
        }
2215
0
    }
2216
0
}
2217
/* find the best inter prediction for each PU of specified mode */
2218
#if ENABLE_SCC_EXT
2219
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2], MV* iMVCandList)
2220
#else
2221
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
2222
#endif
2223
0
{
2224
0
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
2225
2226
0
    CUData& cu = interMode.cu;
2227
0
    Yuv* predYuv = &interMode.predYuv;
2228
2229
    // 12 mv candidates including lowresMV
2230
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2231
2232
0
    const Slice *slice = m_slice;
2233
0
    int numPart     = cu.getNumPartInter(0);
2234
0
    int numPredDir  = slice->isInterP() ? 1 : 2;
2235
0
    const int* numRefIdx = slice->m_numRefIdx;
2236
0
    uint32_t lastMode = 0;
2237
0
    int      totalmebits = 0;
2238
0
    MV       mvzero(0, 0);
2239
0
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2240
0
    MergeData merge;
2241
0
    memset(&merge, 0, sizeof(merge));
2242
0
    bool useAsMVP = false;
2243
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
2244
0
    {
2245
0
        MotionData* bestME = interMode.bestME[puIdx];
2246
0
        PredictionUnit pu(cu, cuGeom, puIdx);
2247
0
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
2248
0
        useAsMVP = false;
2249
0
        x265_analysis_inter_data* interDataCTU = NULL;
2250
0
        int cuIdx;
2251
0
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
2252
0
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
2253
0
        {
2254
0
            interDataCTU = m_frame->m_analysisData.interData;
2255
0
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
2256
0
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
2257
0
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
2258
0
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
2259
0
                useAsMVP = true;
2260
0
        }
2261
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
2262
0
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
2263
0
        bestME[0].cost = MAX_UINT;
2264
0
        bestME[1].cost = MAX_UINT;
2265
2266
0
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
2267
0
        bool bDoUnidir = true;
2268
2269
0
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
2270
        /* Uni-directional prediction */
2271
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2272
0
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
2273
0
        {
2274
0
            for (int list = 0; list < numPredDir; list++)
2275
0
            {
2276
2277
0
                int ref = -1;
2278
0
                if (useAsMVP)
2279
0
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
2280
0
                else
2281
0
                    ref = bestME[list].ref;
2282
0
                if (ref < 0)
2283
0
                {
2284
0
                    continue;
2285
0
                }
2286
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2287
0
                int numIdx = m_slice->m_numRefIdx[list];
2288
#if ENABLE_SCC_EXT
2289
                if (!list && m_ibcEnabled)
2290
                    numIdx--;
2291
#endif
2292
0
                bits += getTUBits(ref, numIdx);
2293
2294
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2295
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2296
#else
2297
0
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2298
0
#endif
2299
0
                const MV* amvp = interMode.amvpCand[list][ref];
2300
0
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2301
0
                MV mvmin, mvmax, outmv, mvp;
2302
0
                if (useAsMVP)
2303
0
                {
2304
0
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
2305
0
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
2306
0
                }
2307
0
                else
2308
0
                    mvp = amvp[mvpIdx];
2309
0
                if (m_param->searchMethod == X265_SEA)
2310
0
                {
2311
0
                    int puX = puIdx & 1;
2312
0
                    int puY = puIdx >> 1;
2313
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2314
0
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2315
0
                }
2316
0
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2317
0
                MV mvpIn = mvp;
2318
0
                int satdCost;
2319
0
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
2320
0
                    mvpIn = bestME[list].mv;
2321
0
                if (useAsMVP && m_param->mvRefine > 1)
2322
0
                {
2323
0
                    MV bestmv, mvpSel[3];
2324
0
                    int mvpIdxSel[3];
2325
0
                    satdCost = m_me.COST_MAX;
2326
0
                    mvpSel[0] = mvp;
2327
0
                    mvpIdxSel[0] = mvpIdx;
2328
0
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2329
0
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
2330
0
                    mvpIdxSel[1] = mvpIdx;
2331
0
                    if (m_param->mvRefine > 2)
2332
0
                    {
2333
0
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
2334
0
                        mvpIdxSel[2] = !mvpIdx;
2335
0
                    }
2336
0
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
2337
0
                    {
2338
0
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
2339
0
                            continue;
2340
0
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
2341
0
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices, m_vertRestriction,
2342
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2343
0
                        if (satdCost > bcost)
2344
0
                        {
2345
0
                            satdCost = bcost;
2346
0
                            outmv = bestmv;
2347
0
                            mvp = mvpSel[cand];
2348
0
                            mvpIdx = mvpIdxSel[cand];
2349
0
                        }
2350
0
                    }
2351
0
                    mvpIn = mvp;
2352
0
                }
2353
0
                else
2354
0
                {
2355
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2356
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2357
0
                }
2358
2359
                /* Get total cost of partition, but only include MV bit cost once */
2360
0
                bits += m_me.bitcost(outmv);
2361
0
                uint32_t mvCost = m_me.mvcost(outmv);
2362
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2363
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
2364
0
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
2365
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2366
0
                else
2367
0
                {
2368
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
2369
                      the actual mvp is bestME from pass 1 for that mvpIdx */
2370
0
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
2371
0
                    if (diffBits < 0)
2372
0
                    {
2373
0
                        mvpIdx = !mvpIdx;
2374
0
                        uint32_t origOutBits = bits;
2375
0
                        bits = origOutBits + diffBits;
2376
0
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
2377
0
                    }
2378
0
                    mvp = amvp[mvpIdx];
2379
0
                }
2380
2381
0
                if (cost < bestME[list].cost)
2382
0
                {
2383
0
                    bestME[list].mv = outmv;
2384
0
                    bestME[list].mvp = mvp;
2385
0
                    bestME[list].mvpIdx = mvpIdx;
2386
0
                    bestME[list].cost = cost;
2387
0
                    bestME[list].bits = bits;
2388
0
                    bestME[list].mvCost  = mvCost;
2389
0
                    bestME[list].ref = ref;
2390
0
                }
2391
0
                bDoUnidir = false;
2392
0
            }            
2393
0
        }
2394
0
        else if (m_param->bDistributeMotionEstimation)
2395
0
        {
2396
0
            PME pme(*this, interMode, cuGeom, pu, puIdx);
2397
0
            pme.m_jobTotal = 0;
2398
0
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
2399
2400
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2401
0
            for (int list = 0; list < numPredDir; list++)
2402
0
            {
2403
0
                int idx = 0;
2404
0
                int numIdx = numRefIdx[list];
2405
#if ENABLE_SCC_EXT
2406
                if (!list && m_ibcEnabled)
2407
                    numIdx--;
2408
#endif
2409
0
                for (int ref = 0; ref < numIdx; ref++)
2410
0
                {
2411
0
                    if (!(refMask & (1 << ref)))
2412
0
                        continue;
2413
2414
0
                    pme.m_jobs.ref[list][idx++]  = ref;
2415
0
                    pme.m_jobTotal++;
2416
0
                }
2417
0
                pme.m_jobs.refCnt[list] = idx;
2418
2419
                /* the second list ref bits start at bit 16 */
2420
0
                refMask >>= 16;
2421
0
            }
2422
2423
0
            if (pme.m_jobTotal > 2)
2424
0
            {
2425
0
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
2426
2427
0
                processPME(pme, *this);
2428
2429
0
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
2430
0
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
2431
2432
0
                bDoUnidir = false;
2433
2434
0
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
2435
0
                pme.waitForExit();
2436
0
            }
2437
2438
            /* if no peer threads were bonded, fall back to doing unidirectional
2439
             * searches ourselves without overhead of singleMotionEstimation() */
2440
0
        }
2441
0
        if (bDoUnidir)
2442
0
        {
2443
0
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
2444
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2445
2446
0
            for (int list = 0; list < numPredDir; list++)
2447
0
            {
2448
0
                int numIdx = numRefIdx[list];
2449
#if ENABLE_SCC_EXT
2450
                if (!list && m_ibcEnabled)
2451
                    numIdx--;
2452
#endif
2453
0
                for (int ref = 0; ref < numIdx; ref++)
2454
0
                {
2455
0
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
2456
2457
0
                    if (!(refMask & (1 << ref)))
2458
0
                    {
2459
0
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
2460
0
                        continue;
2461
0
                    }
2462
2463
0
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2464
0
                    bits += getTUBits(ref, numIdx);
2465
2466
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2467
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2468
#else
2469
0
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2470
0
#endif
2471
2472
0
                    const MV* amvp = interMode.amvpCand[list][ref];
2473
0
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2474
0
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2475
0
                    bool bLowresMVP = false;
2476
2477
0
                    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging when lowresMV is not available */
2478
0
                    {
2479
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
2480
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2481
0
                        if (lmv.notZero() && !layer)
2482
0
                            mvc[numMvc++] = lmv;
2483
0
                        if (m_param->bEnableHME)
2484
0
                            mvp_lowres = lmv;
2485
0
                    }
2486
0
                    if (m_param->searchMethod == X265_SEA)
2487
0
                    {
2488
0
                        int puX = puIdx & 1;
2489
0
                        int puY = puIdx >> 1;
2490
0
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2491
0
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2492
0
                    }
2493
0
                    m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2494
0
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2495
0
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2496
0
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2497
2498
0
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2499
0
                    {
2500
0
                        MV outmv_lowres;
2501
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2502
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2503
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2504
0
                        if (lowresMvCost < satdCost)
2505
0
                        {
2506
0
                            outmv = outmv_lowres;
2507
0
                            satdCost = lowresMvCost;
2508
0
                            bLowresMVP = true;
2509
0
                        }
2510
0
                    }
2511
2512
                    /* Get total cost of partition, but only include MV bit cost once */
2513
0
                    bits += m_me.bitcost(outmv);
2514
0
                    uint32_t mvCost = m_me.mvcost(outmv);
2515
0
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2516
                    /* Update LowresMVP to best AMVP cand*/
2517
0
                    if (bLowresMVP)
2518
0
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2519
2520
                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2521
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2522
2523
#if ENABLE_SCC_EXT
2524
                    if (m_param->bEnableSCC && (list <= 1 && ref <= 1 && (cu.m_partSize[0] == SIZE_2NxN || cu.m_partSize[0] == SIZE_Nx2N) && (1 << cu.m_log2CUSize[0]) <= 16))
2525
                    {
2526
                        iMVCandList[4 * list + 2 * ref + puIdx] = outmv;
2527
                    }
2528
#endif
2529
2530
0
                    if (cost < bestME[list].cost)
2531
0
                    {
2532
0
                        bestME[list].mv      = outmv;
2533
0
                        bestME[list].mvp     = mvp;
2534
0
                        bestME[list].mvpIdx  = mvpIdx;
2535
0
                        bestME[list].ref     = ref;
2536
0
                        bestME[list].cost    = cost;
2537
0
                        bestME[list].bits    = bits;
2538
0
                        bestME[list].mvCost  = mvCost;
2539
0
                    }
2540
0
                }
2541
                /* the second list ref bits start at bit 16 */
2542
0
                refMask >>= 16;
2543
0
            }
2544
0
        }
2545
2546
        /* Bi-directional prediction */
2547
0
        MotionData bidir[2];
2548
0
        uint32_t bidirCost = MAX_UINT;
2549
0
        int bidirBits = 0;
2550
2551
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
2552
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
2553
0
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
2554
0
        {
2555
0
            bidir[0] = bestME[0];
2556
0
            bidir[1] = bestME[1];
2557
2558
0
            int satdCost;
2559
2560
0
            if (m_me.bChromaSATD)
2561
0
            {
2562
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
2563
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2564
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
2565
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2566
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
2567
2568
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2569
0
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2570
0
            }
2571
0
            else
2572
0
            {
2573
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
2574
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
2575
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2576
2577
                /* Generate reference subpels */
2578
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
2579
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
2580
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
2581
0
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
2582
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2583
0
            }
2584
2585
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2586
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2587
2588
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2589
0
            if (bTryZero)
2590
0
            {
2591
                /* Do not try zero MV if unidir motion predictors are beyond
2592
                 * valid search area */
2593
0
                MV mvmin, mvmax;
2594
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2595
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2596
0
                mvmax.y += 2; // there is some pad for subpel refine
2597
0
                mvmin <<= 2;
2598
0
                mvmax <<= 2;
2599
2600
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2601
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2602
0
            }
2603
0
            if (bTryZero)
2604
0
            {
2605
                /* coincident blocks of the two reference pictures */
2606
0
                if (m_me.bChromaSATD)
2607
0
                {
2608
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
2609
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2610
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
2611
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2612
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
2613
2614
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2615
0
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2616
0
                }
2617
0
                else
2618
0
                {
2619
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2620
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2621
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
2622
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2623
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2624
0
                }
2625
0
                MV mvp0 = bestME[0].mvp;
2626
0
                int mvpIdx0 = bestME[0].mvpIdx;
2627
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2628
2629
0
                MV mvp1 = bestME[1].mvp;
2630
0
                int mvpIdx1 = bestME[1].mvpIdx;
2631
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
2632
2633
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2634
2635
                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2636
0
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
2637
0
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
2638
2639
0
                if (cost < bidirCost)
2640
0
                {
2641
0
                    bidir[0].mv = mvzero;
2642
0
                    bidir[1].mv = mvzero;
2643
0
                    bidir[0].mvp = mvp0;
2644
0
                    bidir[1].mvp = mvp1;
2645
0
                    bidir[0].mvpIdx = mvpIdx0;
2646
0
                    bidir[1].mvpIdx = mvpIdx1;
2647
0
                    bidirCost = cost;
2648
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2649
0
                }
2650
0
            }
2651
0
        }
2652
2653
        /* select best option and store into CU */
2654
0
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
2655
0
        {
2656
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
2657
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
2658
0
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
2659
0
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
2660
0
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
2661
0
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
2662
0
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
2663
2664
0
            totalmebits += merge.bits;
2665
0
        }
2666
0
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
2667
0
        {
2668
0
            lastMode = 2;
2669
2670
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2671
0
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
2672
0
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
2673
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
2674
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
2675
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
2676
2677
0
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
2678
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
2679
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
2680
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
2681
2682
0
            totalmebits += bidirBits;
2683
0
        }
2684
0
        else if (bestME[0].cost <= bestME[1].cost)
2685
0
        {
2686
0
            lastMode = 0;
2687
2688
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2689
0
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
2690
0
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
2691
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
2692
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
2693
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
2694
2695
0
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
2696
0
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
2697
2698
0
            totalmebits += bestME[0].bits;
2699
0
        }
2700
0
        else
2701
0
        {
2702
0
            lastMode = 1;
2703
2704
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2705
0
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
2706
0
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
2707
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
2708
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
2709
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
2710
2711
0
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
2712
0
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
2713
2714
0
            totalmebits += bestME[1].bits;
2715
0
        }
2716
2717
0
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
2718
0
    }
2719
0
    interMode.sa8dBits += totalmebits;
2720
0
}
2721
2722
#if ENABLE_SCC_EXT
2723
/* Sum of absolute differences between two width x height pixel blocks.
 *
 * ref / refStride   - first block and the distance (in pixels) between its rows
 * curr / currStride - second block and the distance (in pixels) between its rows
 * width, height     - block dimensions in pixels
 *
 * Returns the accumulated |ref - curr| over every sample of the block; a
 * block with non-positive width or height yields 0. */
uint32_t Search::getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height)
{
    uint32_t sum = 0;
    const pixel* refRow = ref;
    const pixel* curRow = curr;

    /* walk both blocks row by row, stepping each by its own stride */
    for (int y = 0; y < height; y++, refRow += refStride, curRow += currStride)
    {
        for (int x = 0; x < width; x++)
            sum += abs(refRow[x] - curRow[x]);
    }

    return sum;
}
2738
2739
int Search::intraBCSearchMVChromaRefine(Mode& intraBCMode,
2740
    const CUGeom& cuGeom,
2741
    int         roiWidth,
2742
    int         roiHeight,
2743
    int         cuPelX,
2744
    int         cuPelY,
2745
    uint32_t* sadBestCand,
2746
    MV* MVCand,
2747
    uint32_t    partOffset,
2748
    int         puIdx
2749
)
2750
{
2751
    int bestCandIdx = 0;
2752
    uint32_t  sadBest = UINT_MAX;
2753
    uint32_t  tempSad;
2754
2755
    pixel* ref;
2756
    const pixel* picOrg;
2757
    int refStride, orgStride;
2758
    int width, height;
2759
2760
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
2761
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
2762
2763
    CUData& cu = intraBCMode.cu;
2764
    Yuv& tmpPredYuv = intraBCMode.predYuv;
2765
    PredictionUnit pu(cu, cuGeom, puIdx);
2766
2767
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
2768
    {
2769
        if ((!MVCand[cand].x) && (!MVCand[cand].y))
2770
        {
2771
            continue;
2772
        }
2773
2774
        if (((int)(cuPelY + MVCand[cand].y + roiHeight) >= picHeight) || ((cuPelY + MVCand[cand].y) < 0))
2775
        {
2776
            continue;
2777
        }
2778
2779
        if (((int)(cuPelX + MVCand[cand].x + roiWidth) >= picWidth) || ((cuPelX + MVCand[cand].x) < 0))
2780
        {
2781
            continue;
2782
        }
2783
2784
        tempSad = sadBestCand[cand];
2785
        int bitDepths = m_param->sourceBitDepth;
2786
        MV mvQuaterPixl = MVCand[cand];
2787
        mvQuaterPixl <<= 2;
2788
        cu.setPUMv(0, mvQuaterPixl, pu.puAbsPartIdx, puIdx);
2789
        cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
2790
        cu.setPUMv(1, MV(), pu.puAbsPartIdx, puIdx);
2791
        cu.setPURefIdx(1, -1, pu.puAbsPartIdx, puIdx);
2792
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
2793
2794
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
2795
2796
        for (uint32_t ch = TEXT_CHROMA_U; ch < MAX_NUM_COMPONENT; ch++)
2797
        {
2798
            ref = m_slice->m_refFrameList[0][m_slice->m_numRefIdx[0] - 1]->m_reconPic[1]->getChromaAddr(ch, cu.m_cuAddr, cu.m_absIdxInCTU + partOffset);
2799
2800
            picOrg = intraBCMode.fencYuv->getChromaAddr(ch, partOffset);
2801
            orgStride = intraBCMode.fencYuv->m_csize;
2802
2803
            refStride = m_frame->m_reconPic[1]->m_strideC;
2804
2805
            width = roiWidth >> m_hChromaShift;
2806
            height = roiHeight >> m_vChromaShift;
2807
2808
            ref = tmpPredYuv.getChromaAddr(ch, partOffset);
2809
            refStride = tmpPredYuv.m_csize;
2810
2811
            for (int row = 0; row < height; row++)
2812
            {
2813
                for (int col = 0; col < width; col++)
2814
                {
2815
                    tempSad += ((abs(ref[col] - picOrg[col])) >> (bitDepths - 8));
2816
                }
2817
                ref += refStride;
2818
                picOrg += orgStride;
2819
            }
2820
        }
2821
2822
        if (tempSad < sadBest)
2823
        {
2824
            sadBest = tempSad;
2825
            bestCandIdx = cand;
2826
        }
2827
    }
2828
2829
    return bestCandIdx;
2830
}
2831
2832
void Search::updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc)
2833
{
2834
    if (roiWidth + roiHeight > 8)
2835
    {
2836
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvCand, CHROMA_REFINEMENT_CANDIDATES, false);
2837
2838
        if (roiWidth + roiHeight == 32)
2839
        {
2840
            ibc.m_numBV16s = ibc.m_numBVs;
2841
        }
2842
    }
2843
}
2844
2845
void Search::intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* MVCand)
2846
{
2847
    int j = CHROMA_REFINEMENT_CANDIDATES - 1;
2848
2849
    if (sad < sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
2850
    {
2851
        for (int t = CHROMA_REFINEMENT_CANDIDATES - 1; t >= 0; t--)
2852
        {
2853
            if (sad < sadBestCand[t])
2854
            {
2855
                j = t;
2856
            }
2857
        }
2858
2859
        for (int k = CHROMA_REFINEMENT_CANDIDATES - 1; k > j; k--)
2860
        {
2861
            sadBestCand[k] = sadBestCand[k - 1];
2862
2863
            MVCand[k].set(MVCand[k - 1].x, MVCand[k - 1].y);
2864
        }
2865
        sadBestCand[j] = sad;
2866
        MVCand[j].set(x, y);
2867
    }
2868
}
2869
2870
/* Append the vectors of src that are not already present in dst.
 *
 * dst / dn       - destination list and its current length
 * src / sn       - source list and its length
 * isSrcQuarPel   - when false, each source vector is shifted left by 2 before
 *                  it is compared and stored
 *
 * Processing stops as soon as dst holds SCM_S0067_NUM_CANDIDATES entries.
 * Returns the new length of dst. */
uint32_t Search::mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel)
{
    uint32_t srcIdx = 0;

    while (srcIdx < sn && dn < SCM_S0067_NUM_CANDIDATES)
    {
        MV candMv = src[srcIdx++];
        if (!isSrcQuarPel)
            candMv <<= 2;

        /* look for an identical vector among the entries already stored */
        uint32_t hit = 0;
        while (hit < dn && !(candMv == dst[hit]))
            hit++;

        /* reached the end without a match: new vector, append it */
        if (hit == dn)
            dst[dn++] = candMv;
    }

    return dn;
}
2897
2898
/* Demote bi-predictive merge candidates to list-0 uni-prediction where
 * bi-prediction is restricted.
 *
 * For every candidate with interDirNeighbours[] == 3:
 *  - if cu->is8x8BipredRestriction() reports a restriction for the
 *    candidate's two motion fields, the candidate is demoted only when the PU
 *    is no larger than 8x8;
 *  - otherwise it is demoted when cu->isBipredRestriction() is true.
 * Demotion sets the direction to 1 and invalidates the list-1 reference
 * index; the list-1 motion vector is left untouched. */
void Search::restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand)
{
    for (uint32_t idx = 0; idx < numValidMergeCand; ++idx)
    {
        if (interDirNeighbours[idx] != 3)
            continue;

        MVField* fields = mvFieldNeighbours[idx];

        const bool restricted8x8 = cu->is8x8BipredRestriction(
            fields[0].mv,
            fields[1].mv,
            fields[0].refIdx,
            fields[1].refIdx);

        int puWidth = 0;
        int puHeight = 0;
        uint32_t partAddr;

        cu->getPartIndexAndSize(puIdx, partAddr, puWidth, puHeight);

        /* the generic restriction is only consulted when the 8x8 rule did not fire */
        const bool demote = restricted8x8 ? (puWidth <= 8 && puHeight <= 8)
                                          : cu->isBipredRestriction();
        if (demote)
        {
            interDirNeighbours[idx] = 1;
            fields[1].refIdx = REF_NOT_VALID;
        }
    }
}
2933
2934
bool Search::isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* cu,
2935
    int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize)
2936
{
2937
    static const int s_floorLog2[65] =
2938
    {
2939
      -1, 0, 1, 1, 2, 2, 2, 2, 3, 3,
2940
       3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
2941
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2942
       4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
2943
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
2944
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
2945
       5, 5, 5, 5, 6
2946
    };
2947
2948
    int ctuSizeLog2 = s_floorLog2[ctuSize];
2949
    int interpolationSamplesX = (cu->m_chromaFormat == X265_CSP_I422 || cu->m_chromaFormat == X265_CSP_I420) ? ((xBv & 0x1) << 1) : 0;
2950
    int interpolationSamplesY = (cu->m_chromaFormat == X265_CSP_I420) ? ((yBv & 0x1) << 1) : 0;
2951
    int refRightX = xPos + xBv + width - 1 + interpolationSamplesX;
2952
    int refBottomY = yPos + yBv + height - 1 + interpolationSamplesY;
2953
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
2954
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
2955
2956
    if ((xPos + xBv - interpolationSamplesX) < 0)
2957
        return false;
2958
    if (refRightX >= picWidth)
2959
        return false;
2960
    if ((yPos + yBv - interpolationSamplesY) < 0)
2961
        return false;
2962
    if (refBottomY >= picHeight)
2963
        return false;
2964
2965
    if ((xBv + width + interpolationSamplesX) > 0 && (yBv + height + interpolationSamplesY) > 0)
2966
        return false;
2967
2968
    if (refBottomY >> ctuSizeLog2 < yPos >> ctuSizeLog2)
2969
    {
2970
        int refCuX = refRightX / ctuSize;
2971
        int refCuY = refBottomY / ctuSize;
2972
        int cuPelX = xPos / ctuSize;
2973
        int cuPelY = yPos / ctuSize;
2974
2975
        if (((int)(refCuX - cuPelX) > (int)((cuPelY - refCuY))))
2976
            return false;
2977
        else
2978
            return true;
2979
    }
2980
2981
    if (refBottomY >> ctuSizeLog2 > yPos >> ctuSizeLog2)
2982
    {
2983
        return false;
2984
    }
2985
2986
    // in the same CTU line
2987
    if (refRightX >> ctuSizeLog2 < xPos >> ctuSizeLog2)
2988
        return true;
2989
    if (refRightX >> ctuSizeLog2 > xPos >> ctuSizeLog2)
2990
        return false;
2991
2992
    // same CTU
2993
    int mask = 1 << ctuSizeLog2;
2994
    mask -= 1;
2995
    int rasterCurr = ((((yPos & mask) - yStartInCU) >> 2) << (ctuSizeLog2 - 2)) + (((xPos & mask) - xStartInCU) >> 2);
2996
    int rasterRef = (((refBottomY & mask) >> 2) << (ctuSizeLog2 - 2)) + ((refRightX & mask) >> 2);
2997
2998
    if (g_rasterToZscan[rasterRef] >= g_rasterToZscan[rasterCurr])
2999
        return false;
3000
    return true;
3001
}
3002
3003
/* Convenience wrapper around isBlockVectorValid() for a partition of cu.
 *
 * The partition's picture position is the CU origin plus the z-scan pel
 * offset of partOffset; that same offset is forwarded as the start-in-CU
 * arguments, (predX, predY) as the block vector and m_param->maxCUSize as
 * the CTU size.  Returns the validity verdict unchanged. */
bool Search::isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset)
{
    const int puPelX = cu->m_cuPelX + g_zscanToPelX[partOffset];
    const int puPelY = cu->m_cuPelY + g_zscanToPelY[partOffset];

    return isBlockVectorValid(puPelX, puPelY, roiWidth, roiHeight, cu,
                              g_zscanToPelX[partOffset], g_zscanToPelY[partOffset],
                              predX, predY, m_param->maxCUSize);
}
3014
3015
void Search::intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
3016
    MV& mv, uint32_t& cost, int roiWidth, int roiHeight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
3017
{
3018
    const int   srchRngHorLeft = searchRangeLT->x;
3019
    const int   srchRngHorRight = searchRangeRB->x;
3020
    const int   srchRngVerTop = searchRangeLT->y;
3021
    const int   srchRngVerBottom = searchRangeRB->y;
3022
3023
    CUData& cu = intraBCMode.cu;
3024
    const uint32_t  lcuWidth = m_param->maxCUSize;
3025
    const uint32_t  lcuHeight = m_param->maxCUSize;
3026
    const int       puPelOffsetX = g_zscanToPelX[partAddr];
3027
    const int       puPelOffsetY = g_zscanToPelY[partAddr];
3028
    const int       cuPelX = cu.m_cuPelX + puPelOffsetX;  // Point to the location of PU
3029
    const int       cuPelY = cu.m_cuPelY + puPelOffsetY;
3030
3031
    uint32_t  sad = 0;
3032
    uint32_t  sadBest = UINT_MAX;
3033
    int         bestX = 0;
3034
    int         bestY = 0;
3035
    pixel* refSrch;
3036
3037
    int         bestCandIdx = 0;
3038
    uint32_t    partOffset = 0;
3039
    MV          MVCand[CHROMA_REFINEMENT_CANDIDATES];
3040
    uint32_t    sadBestCand[CHROMA_REFINEMENT_CANDIDATES];
3041
3042
    partOffset = partAddr;
3043
    PredictionUnit pu(cu, cuGeom, puIdx);
3044
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3045
    {
3046
        sadBestCand[cand] = UINT_MAX;
3047
        MVCand[cand].set(0, 0);
3048
    }
3049
3050
    const int         relCUPelX = cuPelX % lcuWidth;
3051
    const int         relCUPelY = cuPelY % lcuHeight;
3052
    const int chromaROIWidthInPixels = roiWidth;
3053
    const int chromaROIHeightInPixels = roiHeight;
3054
    bool fastsearch = (m_param->bEnableSCC == 1) ? true : false;
3055
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3056
3057
    if (fastsearch)
3058
    {
3059
        uint32_t tempSadBest = 0;
3060
        int srLeft = srchRngHorLeft, srRight = srchRngHorRight, srTop = srchRngVerTop, srBottom = srchRngVerBottom;
3061
        const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3062
        const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3063
3064
        if (isFullFrameSearchrangeEnabled)//full frame search
3065
        {
3066
            srLeft = -1 * cuPelX;
3067
            srTop = -1 * cuPelY;
3068
3069
            srRight = picWidth - cuPelX - roiWidth;
3070
            srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3071
3072
            if (cuPelX + srRight + roiWidth > (int)picWidth)
3073
            {
3074
                srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3075
            }
3076
            if (cuPelY + srBottom + roiHeight > (int)picHeight)
3077
            {
3078
                srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3079
            }
3080
        }
3081
3082
        if (roiWidth > 8 || roiHeight > 8)
3083
            ibc.m_numBVs = 0;
3084
        else if (roiWidth + roiHeight == 16)
3085
            ibc.m_numBVs = ibc.m_numBV16s;
3086
        if (testOnlyPred)
3087
            ibc.m_numBVs = 0;
3088
3089
        MV  mvPredEncOnly[16];
3090
        int nbPreds = 0;
3091
        cu.getIntraBCMVPsEncOnly(partAddr, mvPredEncOnly, nbPreds, puIdx);
3092
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvPredEncOnly, nbPreds, true);
3093
3094
        for (int cand = 0; cand < ibc.m_numBVs; cand++)
3095
        {
3096
            int xPred = ibc.m_BVs[cand].x >> 2;
3097
            int yPred = ibc.m_BVs[cand].y >> 2;
3098
            if (!(xPred == 0 && yPred == 0) && !((yPred < srTop) || (yPred > srBottom)) && !((xPred < srLeft) || (xPred > srRight)))
3099
            {
3100
                int tempY = yPred + relCUPelY + roiHeight - 1;
3101
                int tempX = xPred + relCUPelX + roiWidth - 1;
3102
                bool validCand = isValidIntraBCSearchArea(&cu, xPred, yPred, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset);
3103
3104
                if ((tempX >= (int)lcuWidth) && (tempY >= 0) && isFullFrameSearchrangeEnabled)
3105
                    validCand = false;
3106
3107
                if ((tempX >= 0) && (tempY >= 0))
3108
                {
3109
                    int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3110
                    uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3111
                    if (tempZscanIdx >= cu.m_absIdxInCTU)
3112
                    {
3113
                        validCand = false;
3114
                    }
3115
                }
3116
3117
                if (validCand)
3118
                {
3119
                    sad = m_me.mvcost(ibc.m_BVs[cand]);
3120
3121
                    refSrch = refY + yPred * refStride + xPred;
3122
3123
                    sad += m_me.bufSAD(refSrch, refStride);
3124
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3125
                    {
3126
                        continue;
3127
                    }
3128
3129
                    intraBCSearchMVCandUpdate(sad, xPred, yPred, sadBestCand, MVCand);
3130
                }
3131
            }
3132
        }
3133
        bestX = MVCand[0].x;
3134
        bestY = MVCand[0].y;
3135
        mv.set(bestX, bestY);
3136
        sadBest = sadBestCand[0];
3137
3138
        if (testOnlyPred)
3139
        {
3140
            cost = sadBest;
3141
            return;
3142
        }
3143
3144
        const int boundY = (0 - roiHeight - puPelOffsetY);
3145
        int lowY = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3146
            ? -cuPelY : X265_MAX(srchRngVerTop, 0 - cuPelY);
3147
        for (int y = boundY; y >= lowY; y--)
3148
        {
3149
            if (!isValidIntraBCSearchArea(&cu, 0, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3150
            {
3151
                continue;
3152
            }
3153
3154
            sad = m_me.mvcost(MV(0, y));
3155
3156
            refSrch = refY + y * refStride;
3157
3158
            sad += m_me.bufSAD(refSrch, refStride);
3159
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3160
            {
3161
                continue;
3162
            }
3163
3164
            intraBCSearchMVCandUpdate(sad, 0, y, sadBestCand, MVCand);
3165
            tempSadBest = sadBestCand[0];
3166
            if (sadBestCand[0] <= 3)
3167
            {
3168
                bestX = MVCand[0].x;
3169
                bestY = MVCand[0].y;
3170
                sadBest = sadBestCand[0];
3171
                mv.set(bestX, bestY);
3172
                cost = sadBest;
3173
3174
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3175
                return;
3176
            }
3177
        }
3178
3179
        const int boundX = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3180
            ? -cuPelX : X265_MAX(srchRngHorLeft, -cuPelX);
3181
        for (int x = 0 - roiWidth - puPelOffsetX; x >= boundX; --x)
3182
        {
3183
            if (!isValidIntraBCSearchArea(&cu, x, 0, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3184
            {
3185
                continue;
3186
            }
3187
3188
            sad = m_me.mvcost(MV(x, 0));
3189
3190
            refSrch = refY + x;
3191
            sad += m_me.bufSAD(refSrch, refStride);
3192
3193
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3194
            {
3195
                continue;
3196
            }
3197
3198
            intraBCSearchMVCandUpdate(sad, x, 0, sadBestCand, MVCand);
3199
            tempSadBest = sadBestCand[0];
3200
            if (sadBestCand[0] <= 3)
3201
            {
3202
                bestX = MVCand[0].x;
3203
                bestY = MVCand[0].y;
3204
                sadBest = sadBestCand[0];
3205
                mv.set(bestX, bestY);
3206
                cost = sadBest;
3207
3208
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3209
                return;
3210
            }
3211
        }
3212
3213
        bestX = MVCand[0].x;
3214
        bestY = MVCand[0].y;
3215
        sadBest = sadBestCand[0];
3216
3217
        if ((!bestX && !bestY) || (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 32))
3218
        {
3219
            //chroma refine
3220
            bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3221
            bestX = MVCand[bestCandIdx].x;
3222
            bestY = MVCand[bestCandIdx].y;
3223
            sadBest = sadBestCand[bestCandIdx];
3224
            mv.set(bestX, bestY);
3225
            cost = sadBest;
3226
3227
            updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3228
            return;
3229
        }
3230
3231
        if (cuGeom.depth > 2 && !bUse1DSearchFor8x8)
3232
        {
3233
            for (int y = X265_MAX(srchRngVerTop, -cuPelY); y <= srchRngVerBottom; y += 2)
3234
            {
3235
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3236
                {
3237
                    continue;
3238
                }
3239
3240
                int tempY = y + relCUPelY + roiHeight - 1;
3241
3242
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x++)
3243
                {
3244
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3245
                    {
3246
                        continue;
3247
                    }
3248
3249
                    int tempX = x + relCUPelX + roiWidth - 1;
3250
3251
                    if ((tempX >= 0) && (tempY >= 0))
3252
                    {
3253
                        int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3254
                        uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3255
                        if (iTempZscanIdx >= cu.m_absIdxInCTU)
3256
                        {
3257
                            continue;
3258
                        }
3259
                    }
3260
3261
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3262
                    {
3263
                        continue;
3264
                    }
3265
3266
                    sad = m_me.mvcost(MV(x, y));
3267
3268
                    refSrch = refY + y * refStride + x;
3269
                    sad += m_me.bufSAD(refSrch, refStride);
3270
3271
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3272
                }
3273
            }
3274
3275
            bestX = MVCand[0].x;
3276
            bestY = MVCand[0].y;
3277
            sadBest = sadBestCand[0];
3278
            if (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 16)
3279
            {
3280
                //chroma refine
3281
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3282
                bestX = MVCand[bestCandIdx].x;
3283
                bestY = MVCand[bestCandIdx].y;
3284
                sadBest = sadBestCand[bestCandIdx];
3285
                mv.set(bestX, bestY);
3286
                cost = sadBest;
3287
3288
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3289
                return;
3290
            }
3291
3292
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3293
            {
3294
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3295
                {
3296
                    continue;
3297
                }
3298
3299
                int tempY = y + relCUPelY + roiHeight - 1;
3300
3301
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x += 2)
3302
                {
3303
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3304
                    {
3305
                        continue;
3306
                    }
3307
3308
                    int tempX = x + relCUPelX + roiWidth - 1;
3309
3310
                    if ((tempX >= 0) && (tempY >= 0))
3311
                    {
3312
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3313
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3314
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3315
                        {
3316
                            continue;
3317
                        }
3318
                    }
3319
3320
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3321
                    {
3322
                        continue;
3323
                    }
3324
3325
                    sad = m_me.mvcost(MV(x, y));
3326
3327
                    refSrch = refY + y * refStride + x;
3328
                    sad += m_me.bufSAD(refSrch, refStride);
3329
3330
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3331
                    {
3332
                        continue;
3333
                    }
3334
3335
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3336
                    if (sadBestCand[0] <= 5)
3337
                    {
3338
                        //chroma refine & return
3339
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3340
                        bestX = MVCand[bestCandIdx].x;
3341
                        bestY = MVCand[bestCandIdx].y;
3342
                        sadBest = sadBestCand[bestCandIdx];
3343
                        mv.set(bestX, bestY);
3344
                        cost = sadBest;
3345
3346
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3347
                        return;
3348
                    }
3349
                }
3350
            }
3351
3352
            bestX = MVCand[0].x;
3353
            bestY = MVCand[0].y;
3354
            sadBest = sadBestCand[0];
3355
3356
            if ((sadBest >= tempSadBest) || ((sadBest - m_me.mvcost(MV(bestX, bestY))) <= 32))
3357
            {
3358
                //chroma refine
3359
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3360
                bestX = MVCand[bestCandIdx].x;
3361
                bestY = MVCand[bestCandIdx].y;
3362
                sadBest = sadBestCand[bestCandIdx];
3363
                mv.set(bestX, bestY);
3364
                cost = sadBest;
3365
3366
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3367
                return;
3368
            }
3369
3370
            tempSadBest = sadBestCand[0];
3371
3372
3373
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3374
            {
3375
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3376
                {
3377
                    continue;
3378
                }
3379
3380
                int tempY = y + relCUPelY + roiHeight - 1;
3381
3382
                for (int x = (X265_MAX(srchRngHorLeft, -cuPelX) + 1); x <= srchRngHorRight; x += 2)
3383
                {
3384
3385
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3386
                    {
3387
                        continue;
3388
                    }
3389
3390
                    int tempX = x + relCUPelX + roiWidth - 1;
3391
3392
                    if ((tempX >= 0) && (tempY >= 0))
3393
                    {
3394
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3395
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3396
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3397
                        {
3398
                            continue;
3399
                        }
3400
                    }
3401
3402
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3403
                    {
3404
                        continue;
3405
                    }
3406
3407
                    sad = m_me.mvcost(MV(x, y));
3408
3409
                    refSrch = refY + y * refStride + x;
3410
                    sad += m_me.bufSAD(refSrch, refStride);
3411
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3412
                    {
3413
                        continue;
3414
                    }
3415
3416
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3417
                    if (sadBestCand[0] <= 5)
3418
                    {
3419
                        //chroma refine & return
3420
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3421
                        bestX = MVCand[bestCandIdx].x;
3422
                        bestY = MVCand[bestCandIdx].y;
3423
                        sadBest = sadBestCand[bestCandIdx];
3424
                        mv.set(bestX, bestY);
3425
                        cost = sadBest;
3426
3427
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3428
                        return;
3429
                    }
3430
                }
3431
            }
3432
        }
3433
    }
3434
    else //full search
3435
    {
3436
        refY += (srchRngVerBottom * refStride);
3437
        int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3438
        int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3439
3440
        for (int y = srchRngVerBottom; y >= srchRngVerTop; y--)
3441
        {
3442
            if (((int)(cuPelY + y) < 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3443
            {
3444
                refY -= refStride;
3445
                continue;
3446
            }
3447
3448
            for (int x = srchRngHorLeft; x <= srchRngHorRight; x++)
3449
            {
3450
3451
                if (((int)(cuPelX + x) < 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3452
                {
3453
                    continue;
3454
                }
3455
3456
                int tempX = x + relCUPelX + roiWidth - 1;
3457
                int tempY = y + relCUPelY + roiHeight - 1;
3458
                if ((tempX >= 0) && (tempY >= 0))
3459
                {
3460
                    int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3461
                    uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3462
                    if (iTempZscanIdx >= cu.m_absIdxInCTU)
3463
                    {
3464
                        continue;
3465
                    }
3466
                }
3467
3468
                if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3469
                {
3470
                    continue;
3471
                }
3472
3473
                refSrch = refY + x;
3474
3475
                sad = m_me.bufSAD(refSrch, refStride);
3476
                sad += m_me.mvcost(MV(x, y));
3477
                if (sad < sadBest)
3478
                {
3479
                    sadBest = sad;
3480
                    bestX = x;
3481
                    bestY = y;
3482
                }
3483
                intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3484
            }
3485
3486
            refY -= refStride;
3487
        }
3488
    }
3489
3490
    bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3491
    bestX = MVCand[bestCandIdx].x;
3492
    bestY = MVCand[bestCandIdx].y;
3493
    sadBest = sadBestCand[bestCandIdx];
3494
    mv.set(bestX, bestY);
3495
    cost = sadBest;
3496
3497
    updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3498
3499
}
3500
3501
/* Derive the integer-pel block-vector search window for one intra block copy PU.
 *
 * The window is expressed as offsets relative to the PU's top-left luma sample
 * and is returned through searchRangeLT (left/top) and searchRangeRB
 * (right/bottom) after both corners have been passed through cu.clipMv().
 *
 * pred is only copied and the copy clipped; the caller's vector is not written. */
void Search::setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB)
{
    CUData& cu = intraBCMode.cu;

    MV clippedPred = pred;
    cu.clipMv(clippedPred);

    /* only the partition address is needed here, the PU dimensions come from roiWidth/roiHeight */
    uint32_t puPartAddr;
    int ignoredWidth, ignoredHeight;
    cu.getPartIndexAndSize(puIdx, puPartAddr, ignoredWidth, ignoredHeight);

    const uint32_t ctuSize = m_param->maxCUSize; /* used for both CTU width and CTU height */
    const uint32_t puPelX = cu.m_cuPelX + g_zscanToPelX[puPartAddr];
    const uint32_t puPelY = cu.m_cuPelY + g_zscanToPelY[puPartAddr];
    const uint32_t frameWidth = m_slice->m_sps->picWidthInLumaSamples;
    const uint32_t frameHeight = m_slice->m_sps->picHeightInLumaSamples;

    const int cuSize = 1 << cu.m_log2CUSize[0];
    const bool bFullFrameSearch = false; /* disabled by default */

    int rangeLeft, rangeTop, rangeRight;
    /* the bottom limit is the bottom edge of the current CTU in either mode */
    int rangeBottom = ctuSize - puPelY % ctuSize - roiHeight;

    if (bFullFrameSearch && cuSize == 16 && cu.m_partSize[0] == SIZE_2Nx2N)
    {
        /* full frame search: everything to the left / above inside the picture */
        rangeLeft = -1 * puPelX;
        rangeTop = -1 * puPelY;
        rangeRight = frameWidth - puPelX - roiWidth;
    }
    else
    {
        uint32_t ctusToSearch = 1;
        if (cuSize != 8 && bFullFrameSearch)
            ctusToSearch = (uint32_t)-1;
        const uint32_t leftLimit = ctusToSearch * ctuSize;

        /* walk the chain of left neighbour CTUs, adding one CTU width for every
         * neighbour that exists and has a slice, until the limit is reached */
        uint32_t leftAvail = 0;
        const CUData* leftCtu = cu.m_cuLeft;
        while (leftAvail < leftLimit && leftCtu != NULL && leftCtu->m_slice != NULL)
        {
            leftCtu = leftCtu->m_cuLeft;
            leftAvail += ctuSize;
        }

        int maxXsr = (puPelX % ctuSize) + X265_MIN(leftLimit, leftAvail);
        int maxYsr = puPelY % ctuSize;

        /* clear bit 2 of the reach for chroma formats subsampled in that direction */
        const bool bIs420 = cu.m_chromaFormat == X265_CSP_I420;
        if (bIs420 || cu.m_chromaFormat == X265_CSP_I422)
            maxXsr &= ~0x4;
        if (bIs420)
            maxYsr &= ~0x4;

        rangeLeft = -maxXsr;
        rangeTop = -maxYsr;
        rangeRight = ctuSize - puPelX % ctuSize - roiWidth;
    }

    /* keep the right / bottom limits inside the picture */
    if (puPelX + rangeRight + roiWidth > frameWidth)
        rangeRight = frameWidth % ctuSize - puPelX % ctuSize - roiWidth;
    if (puPelY + rangeBottom + roiHeight > frameHeight)
        rangeBottom = frameHeight % ctuSize - puPelY % ctuSize - roiHeight;

    searchRangeLT.x = rangeLeft;
    searchRangeLT.y = rangeTop;
    cu.clipMv(searchRangeLT);

    searchRangeRB.x = rangeRight;
    searchRangeRB.y = rangeBottom;
    cu.clipMv(searchRangeRB);
}
3568
3569
/* Run the intra block copy block-vector search for a single PU.
 *
 * Looks up the PU geometry, locates the luma samples of the last list-0
 * reference (m_reconPic[1] of that frame) at the PU position, derives the
 * search window with setIntraSearchRange(), installs *pred as the motion
 * estimator's MVP and then hands over to intraPatternSearch(), which writes
 * the selected vector to mv and its cost to cost. */
void Search::intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
{
    /* two independent copies of the incoming predictor: one is passed by
     * non-const reference to the range setup, one is given to the estimator */
    MV rangePred = *pred;
    const MV mvpForCost = *pred;

    CUData& cu = intraBCMode.cu;

    uint32_t puPartAddr;
    int puWidth;
    int puHeight;
    cu.getPartIndexAndSize(puIdx, puPartAddr, puWidth, puHeight);

    /* last entry of reference list 0 */
    const int lastRef = m_slice->m_numRefIdx[0] - 1;
    pixel* lumaRef = m_slice->m_refFrameList[0][lastRef]->m_reconPic[1]->getLumaAddr(cu.m_cuAddr, cu.m_absIdxInCTU + puPartAddr);
    int lumaStride = m_slice->m_refFrameList[0][lastRef]->m_reconPic[1]->m_stride;

    MV rangeLT;
    MV rangeRB;
    setIntraSearchRange(intraBCMode, rangePred, puIdx, puWidth, puHeight, rangeLT, rangeRB);

    m_me.setMVP(mvpForCost);

    intraPatternSearch(intraBCMode, cuGeom, puIdx, puPartAddr, lumaRef, lumaStride, &rangeLT, &rangeRB, mv, cost,
                       puWidth, puHeight, testOnlyPred, bUse1DSearchFor8x8, ibc);
}
3593
3594
/* Intra block copy prediction search for every PU of a CU.
 *
 * For each PU the function
 *   1. builds the AMVP predictors for the last list-0 reference and runs the
 *      integer-pel block-vector search (intraBlockCopyEstimate),
 *   2. measures the luma + chroma SAD of the found vector and adds the cost of
 *      the cheaper of the two AMVP predictors,
 *   3. for non-2Nx2N partitions, evaluates every list-0-only merge candidate
 *      that refers to the current picture and passes isBlockVectorValid(),
 *   4. writes whichever of AMVP / merge is cheaper into the CU and generates
 *      the prediction into intraBCMode.predYuv.
 *
 * Returns false when
 *   - bEnableSCC == 1 and the CU is larger than SCM_S0067_MAX_CAND_SIZE,
 *   - the search returns a zero vector for any PU,
 *   - testOnlyPred is set, the CU has a single PU and the total cost exceeds
 *     the abort threshold (2 * CU area), or
 *   - testOnlyPred is not set, the total cost is below the abort threshold and
 *     3/4 of it is still not better than m_lastCandCost.
 * Returns true otherwise.  m_lastCandCost is updated only when testOnlyPred
 * is set. */
bool Search::predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
{
    MV zeroMv(0, 0);
    CUData& cu = intraBCMode.cu;
    Yuv* predYuv = &intraBCMode.predYuv;
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
    int  numPart = cu.getNumPartInter(0);
    int log2ParallelMergeLevelMinus2 = 0;

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];

    if (m_param->bEnableSCC == 1 && (1 << cu.m_log2CUSize[0]) > SCM_S0067_MAX_CAND_SIZE) // fast search
        return false;

    uint32_t totalCost = 0;
    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        int width, height;
        uint32_t partAddr = 0;
        MotionData* bestME = intraBCMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);
        MV  mv, mvPred[2];
        cu.getPartIndexAndSize(puIdx, pu.puAbsPartIdx, width, height);
        partAddr = pu.puAbsPartIdx;
        m_me.setSourcePU(*intraBCMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);

        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, intraBCMode.interNeighbours);
        cu.getPMV(intraBCMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, puIdx, pu.puAbsPartIdx);

        // AMVP candidates are stored in quarter-pel units, the search works in integer pel
        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);

        uint32_t cost;
        mv.set(0, 0);
        intraBlockCopyEstimate(intraBCMode, cuGeom, puIdx, mvPred, mv, cost, testOnlyPred, bUse1DSearchFor8x8, ibc);

        bestME->mv.set(mv.x << 2, mv.y << 2);
        bestME->cost = cost;
        totalCost += cost;

        // a zero vector means the search found nothing usable for this PU
        if (mv.x == 0 && mv.y == 0)
        {
            if (testOnlyPred)
            {
                m_lastCandCost = MAX_UINT;
            }
            return false;
        }

        int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
        int distAMVPBest, distMergeTemp;
        int costAMVPBest, costMergeBest, costMergeTemp;
        bitsAMVPBest = MAX_INT;
        costAMVPBest = MAX_INT;
        costMergeBest = MAX_INT;
        int mvpIdxBest = 0;
        int mvpIdxTemp;
        int mrgIdxBest = -1;
        int mrgIdxTemp = -1;

        // position of this PU inside the CU, needed by isBlockVectorValid()
        int xCUStart = cu.m_cuPelX;
        int yCUStart = cu.m_cuPelY;
        int xStartInCU = 0, yStartInCU = 0;
        if (ePartSize == SIZE_2Nx2N)
            xStartInCU = yStartInCU = 0;
        else if (ePartSize == SIZE_2NxN)
        {
            xStartInCU = 0;
            yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
        }
        else if (ePartSize == SIZE_Nx2N)
        {
            xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
            yStartInCU = 0;
        }
        const pixel* currStart;
        pixel* ref;
        int currStride, refStride;
        distAMVPBest = 0;

        // predict with the searched vector (list 0 only, last reference index)
        MV cMvQuaterPixl = mv;
        cMvQuaterPixl <<= 2;
        cu.setPUMv(0, cMvQuaterPixl, pu.puAbsPartIdx, puIdx);
        cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, puIdx);
        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);

        // distortion of the searched vector: luma SAD plus SAD of the remaining components
        for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
        {
            if (ch == 0)
            {
                ref = tmpPredYuv.getLumaAddr(partAddr);
                refStride = tmpPredYuv.m_size;
                distAMVPBest += m_me.bufSAD(ref, refStride);
            }
            else
            {
                int tempHeight = height >> m_vChromaShift;
                int tempWidth = width >> m_hChromaShift;

                currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
                currStride = intraBCMode.fencYuv->m_csize;
                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
                refStride = tmpPredYuv.m_csize;
                distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
            }
        }

        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);

        // pick the AMVP predictor that makes the vector cheapest to code
        for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
        {
            m_me.setMVP(mvPred[mvpIdxTemp]);
            bitsAMVPTemp = m_me.bitcost(mv, mvPred[mvpIdxTemp]);
            if (bitsAMVPTemp < bitsAMVPBest)
            {
                bitsAMVPBest = bitsAMVPTemp;
                mvpIdxBest = mvpIdxTemp;
            }
        }

        bitsAMVPBest++; // for MVP Index bits
        costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);

        MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
        uint8_t uhInterDirNeighbours[MRG_MAX_NUM_CANDS];
        int numValidMergeCand = 0;

        for (int i = 0; i < MRG_MAX_NUM_CANDS; i++)
        {
            cMvFieldNeighbours[i][0].mv.set(0, 0);
            cMvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
        }

        // merge candidates are only evaluated for non-2Nx2N partitions
        if (ePartSize != SIZE_2Nx2N)
        {
            if (log2ParallelMergeLevelMinus2 && (1 << cu.m_log2CUSize[0]) >= 8)
            {
                cu.setPartSizeSubParts(SIZE_2Nx2N);
                if (puIdx == 0)
                {
                    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, uhInterDirNeighbours);
                }
                cu.setPartSizeSubParts(ePartSize);
            }
            else
            {
                numValidMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, cMvFieldNeighbours, uhInterDirNeighbours);
            }

            cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
            restrictBipredMergeCand(&cu, puIdx, cMvFieldNeighbours, uhInterDirNeighbours, numValidMergeCand);

            for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCand; mrgIdxTemp++)
            {
                // only list-0 uni-predicted candidates are considered
                if (uhInterDirNeighbours[mrgIdxTemp] != 1)
                {
                    continue;
                }
                // the candidate must reference the current picture
                if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
                {
                    continue;
                }

                if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, width, height, &cu,
                    xStartInCU, yStartInCU, (cMvFieldNeighbours[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighbours[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
                {
                    continue;
                }
                bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;

                distMergeTemp = 0;

                cu.setPUMv(0, cMvFieldNeighbours[mrgIdxTemp][0].mv, pu.puAbsPartIdx, puIdx);
                cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, puIdx);
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
                cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);

                for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
                {
                    if (ch == 0)
                    {
                        ref = tmpPredYuv.getLumaAddr(partAddr);
                        refStride = tmpPredYuv.m_size;
                        distMergeTemp += m_me.bufSAD(ref, refStride);
                    }
                    else
                    {
                        int tempHeight = height >> m_vChromaShift;
                        int tempWidth = width >> m_hChromaShift;

                        currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
                        currStride = intraBCMode.fencYuv->m_csize;
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
                        refStride = tmpPredYuv.m_csize;
                        distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
                    }
                }
                costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);

                if (costMergeTemp < costMergeBest)
                {
                    costMergeBest = costMergeTemp;
                    mrgIdxBest = mrgIdxTemp;
                }
            }
        }

        /* mrgIdxBest stays -1 when no merge candidate was accepted (always the
         * case for 2Nx2N).  Fall back to AMVP then, so the merge path can never
         * index cMvFieldNeighbours[-1] or store (uint8_t)-1 as a merge index. */
        if (mrgIdxBest < 0 || costAMVPBest < costMergeBest)
        {
            MV tempmv((mv.x << 2), (mv.y << 2));
            MVField mvField[2];
            mvField[0].mv = tempmv;
            mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
            mvField[1].mv = zeroMv;
            mvField[1].refIdx = REF_NOT_VALID;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction

            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            // the vector difference is stored in integer-pel units relative to the chosen predictor
            MV mvd;
            mvd.set(mv.x - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].x >> 2), mv.y - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].y >> 2));

            cu.m_mvd[0][pu.puAbsPartIdx] = mvd;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mvpIdxBest;
        }
        else
        {
            MV mergeMv(cMvFieldNeighbours[mrgIdxBest][0].mv.x, cMvFieldNeighbours[mrgIdxBest][0].mv.y);
            MVField mvField[2];
            mvField[0].mv = mergeMv;
            mvField[0].refIdx = cu.m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
            mvField[1].mv = zeroMv;
            mvField[1].refIdx = REF_NOT_VALID;

            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mrgIdxBest; /* merge candidate ID is stored in L0 MVP idx */
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction

            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            cu.m_mvd[0][pu.puAbsPartIdx] = zeroMv;
            cu.m_mvd[1][pu.puAbsPartIdx] = zeroMv;
        }

        // final prediction for this PU with the selected motion data
        motionCompensation(cu, pu, *predYuv, 1, 1);
    }

    // NOTE(review): this object is not read below; kept because the constructor is not visible here
    PredictionUnit pu(cu, cuGeom, 0);
    uint32_t abortThreshold = (1 << cu.m_log2CUSize[0]) * (1 << cu.m_log2CUSize[0]) * 2;
    if (testOnlyPred)
    {
        if (numPart == 1 && totalCost > abortThreshold)
        {
            m_lastCandCost = MAX_UINT;
            return false;
        }
        m_lastCandCost = totalCost;
    }
    else if (totalCost < abortThreshold && ((3 * totalCost) >> 2) >= m_lastCandCost)
    {
        return false;
    }
    return true;
}
3874
3875
/* Evaluate a two-part CU in which one PU uses intra block copy (list 0, reference index
 * m_numRefIdx[0] - 1) and the other PU uses regular inter prediction.
 *
 * Two combinations are costed:
 *   combo 0: part 0 = IBC,   part 1 = inter
 *   combo 1: part 0 = inter, part 1 = IBC
 * The cheaper combination (combo 0 on a tie) is written into intraBCMixedMode.cu and
 * motion compensated into intraBCMixedMode.predYuv.
 *
 * intraBCMixedMode  mode whose cu / predYuv / amvpCand / interNeighbours are filled in
 * cuGeom            geometry used to build each PredictionUnit
 * bChromaMC         forwarded unchanged to m_me.setSourcePU()
 * ePartSize         partition size applied to the CU
 * iMvCandList       candidate vectors; read at [4 * refList + 2 * refIdx + partIdx] for the
 *                   inter PU and at [8 + partIdx] for the IBC PU
 *
 * Returns false when the IBC candidate is the zero vector in both combinations,
 * true otherwise. */
bool Search::predMixedIntraBCInterSearch(Mode& intraBCMixedMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMvCandList)
3876
{
3877
    intraBCMixedMode.initCosts();
3878
    intraBCMixedMode.cu.setPartSizeSubParts(ePartSize);
3879
    intraBCMixedMode.cu.setPredModeSubParts(MODE_INTER);
3880
    CUData& cu = intraBCMixedMode.cu;
3881
    int numComb = 2;
3882
    int numPart = 2;
3883
    // accumulated cost of each combination (sum over its two parts)
    uint32_t cost[2] = { 0,0 };
3884
    uint32_t maxCost = UINT32_MAX;
3885
3886
    int      numPredDir = m_slice->isInterP() ? 1 : 2;
3887
    MV       cMvZero(0, 0);
3888
3889
    // per-combination, per-part results remembered for the final write-back
    MV  cMvPredCand[2][2];
3890
    // number of combinations whose IBC candidate was the zero vector
    int IBCValidFlag = 0;
3891
    int bestIBCMvpIdx[2] = { 0, 0 };
3892
    int bestInterMvpIdx[2] = { 0, 0 };
3893
    int bestInterDir[2] = { 0, 0 };
3894
    int bestRefIdx[2] = { 0, 0 };
3895
    bool isMergeMode[2] = { false, false };
3896
    bool isIBCMergeMode[2] = { false, false };
3897
    MVField cMRGMvField[2][2];
3898
    MVField cMRGMvFieldIBC[2][2];
3899
    // NOTE(review): never modified in this function, so the parallel-merge branch
    // guarded by it in the IBC merge search below is never taken.
    int log2ParallelMergeLevelMinus2 = 0;
3900
    // 12 mv candidates including lowresMV
3901
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
3902
3903
    Yuv* predYuv = &intraBCMixedMode.predYuv;
3904
    // scratch prediction buffer used while costing individual candidates
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
3905
3906
    for (int combo = 0; combo < numComb; combo++) // number of combination
3907
    {
3908
        for (int partIdx = 0; partIdx < numPart; ++partIdx)
3909
        {
3910
            int dummyWidth, dummyHeight;
3911
            uint32_t partAddr = 0;
3912
            PredictionUnit pu(cu, cuGeom, partIdx);
3913
            cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth, dummyHeight);
3914
            m_me.setSourcePU(*intraBCMixedMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
3915
3916
            MV mvPred[2];
3917
            MV bvPred[2];
3918
            // combo 0 puts the IBC PU first, combo 1 puts it second
            if ((combo == 0 && partIdx == 0) || (combo == 1 && partIdx == 1)) // intraBC
3919
            {
3920
                MV cMv = iMvCandList[8 + partIdx];
3921
                // a zero block vector makes this combination unusable: pin its cost to the
                // maximum and stop processing its remaining parts
                if (cMv.x == 0 && cMv.y == 0)
3922
                {
3923
                    cost[combo] = maxCost;
3924
                    IBCValidFlag++;
3925
                    break;
3926
                }
3927
3928
                cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
3929
                cu.getPMV(intraBCMixedMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, partIdx, pu.puAbsPartIdx);
3930
3931
                bvPred[0] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0];
3932
                bvPred[1] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1];
3933
                // predictors are used at 1/4 of the stored precision
                // (presumably quarter-pel -> integer-pel; confirm against MV units)
                bvPred[0] >>= 2;
3934
                bvPred[1] >>= 2;
3935
3936
                /////////////////////////////////////////////////////////////
3937
                // ibc merge
3938
                // choose one MVP and compare with merge mode
3939
3940
                int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
3941
                int distAMVPBest, distMergeTemp;
3942
                int costAMVPBest, costMergeBest, costMergeTemp;
3943
                bitsAMVPBest = MAX_INT;
3944
                costAMVPBest = MAX_INT;
3945
                costMergeBest = MAX_INT;
3946
                int mvpIdxBest = 0;
3947
                int mvpIdxTemp;
3948
                int mrgIdxBest = -1;
3949
                int mrgIdxTemp = -1;
3950
                int xCUStart = cu.m_cuPelX;
3951
                int yCUStart = cu.m_cuPelY;
3952
                // offset of this PU inside the CU; only 2Nx2N, 2NxN and Nx2N are handled,
                // any other partition size leaves the offset at (0, 0)
                int xStartInCU = 0, yStartInCU = 0;
3953
                if (ePartSize == SIZE_2Nx2N)
3954
                    xStartInCU = yStartInCU = 0;
3955
                else if (ePartSize == SIZE_2NxN)
3956
                {
3957
                    xStartInCU = 0;
3958
                    yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
3959
                }
3960
                else if (ePartSize == SIZE_Nx2N)
3961
                {
3962
                    xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
3963
                    yStartInCU = 0;
3964
                }
3965
                const pixel* currStart;
3966
                int currStride;
3967
                int refStride;
3968
                distAMVPBest = 0;
3969
                pixel* ref;
3970
3971
                // predict the PU with the supplied block vector into the scratch buffer
                cu.setPUMv(0, cMv, pu.puAbsPartIdx, partIdx);
3972
                // NOTE(review): the int8_t cast applies to m_numRefIdx[0] before the "- 1";
                // the other call sites cast the whole expression - confirm this is intended
                cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, partIdx);
3973
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
3974
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
3975
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
3976
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
3977
3978
                // distortion of the AMVP candidate: luma SAD via m_me, chroma SAD via getSAD
                for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
3979
                {
3980
                    int tempHeight, tempWidth;
3981
                    if (ch == 0)
3982
                    {
3983
                        // tempHeight / tempWidth are assigned but not read on the luma path
                        tempHeight = dummyHeight;
3984
                        tempWidth = dummyWidth;
3985
                        ref = tmpPredYuv.getLumaAddr(partAddr);
3986
                        refStride = tmpPredYuv.m_size;
3987
                        distAMVPBest += m_me.bufSAD(ref, refStride);
3988
                    }
3989
                    else
3990
                    {
3991
                        tempHeight = dummyHeight >> m_vChromaShift;
3992
                        tempWidth = dummyWidth >> m_hChromaShift;
3993
3994
                        currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
3995
                        currStride = intraBCMixedMode.fencYuv->m_csize;
3996
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
3997
                        refStride = tmpPredYuv.m_csize;
3998
                        distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
3999
                    }
4000
                }
4001
4002
                // NOTE(review): 'check' is declared but never used
                MV check;
4003
                // keep the predictor that needs the fewest bits to code the block vector
                for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4004
                {
4005
                    m_me.setMVP(bvPred[mvpIdxTemp]);
4006
                    bitsAMVPTemp = m_me.bitcost(cMv >> 2, bvPred[mvpIdxTemp]);
4007
                    if (bitsAMVPTemp < bitsAMVPBest)
4008
                    {
4009
                        bitsAMVPBest = bitsAMVPTemp;
4010
                        mvpIdxBest = mvpIdxTemp;
4011
                    }
4012
                }
4013
4014
                bitsAMVPBest++; // for MVP Index bits
4015
                costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4016
4017
                MVField cMvFieldNeighboursIBC[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4018
                uint8_t uhInterDirNeighboursIBC[MRG_MAX_NUM_CANDS];
4019
                int numValidMergeCandIBC = 0;
4020
4021
                // the IBC merge search is only run for non-2Nx2N partitions
                if (ePartSize != SIZE_2Nx2N)
4022
                {
4023
                    if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4024
                    {
4025
                        cu.setPartSizeSubParts(SIZE_2Nx2N);
4026
                        if (partIdx == 0)
4027
                        {
4028
                            numValidMergeCandIBC = cu.getInterMergeCandidates(0, 0, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4029
                        }
4030
                        cu.setPartSizeSubParts(ePartSize);
4031
                    }
4032
                    else
4033
                    {
4034
                        numValidMergeCandIBC = cu.getInterMergeCandidates(pu.puAbsPartIdx, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4035
                    }
4036
4037
                    cu.roundMergeCandidates(cMvFieldNeighboursIBC, numValidMergeCandIBC);
4038
                    restrictBipredMergeCand(&cu, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC, numValidMergeCandIBC);
4039
4040
                    for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCandIBC; mrgIdxTemp++)
4041
                    {
4042
                        // only list-0 uni-directional candidates are considered
                        if (uhInterDirNeighboursIBC[mrgIdxTemp] != 1)
4043
                        {
4044
                            continue;
4045
                        }
4046
                        // the candidate must reference the current picture (same POC)
                        if (m_slice->m_refPOCList[0][cMvFieldNeighboursIBC[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4047
                        {
4048
                            continue;
4049
                        }
4050
4051
                        // the candidate vector (shifted right by 2) must pass isBlockVectorValid
                        if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, dummyWidth, dummyHeight, &cu,
4052
                            xStartInCU, yStartInCU, (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4053
                        {
4054
                            continue;
4055
                        }
4056
                        // bit estimate for signalling the merge index
                        bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4057
4058
                        distMergeTemp = 0;
4059
                        cu.setPUMv(0, cMvFieldNeighboursIBC[mrgIdxTemp][0].mv, pu.puAbsPartIdx, partIdx);
4060
                        cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4061
                        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4062
                        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4063
                        cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4064
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4065
4066
                        // same distortion measure as for the AMVP candidate above
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4067
                        {
4068
                            int tempHeight, tempWidth;
4069
                            if (ch == 0)
4070
                            {
4071
                                tempHeight = dummyHeight;
4072
                                tempWidth = dummyWidth;
4073
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4074
                                refStride = tmpPredYuv.m_size;
4075
                                distMergeTemp += m_me.bufSAD(ref, refStride);
4076
                            }
4077
                            else
4078
                            {
4079
                                tempHeight = dummyHeight >> m_vChromaShift;
4080
                                tempWidth = dummyWidth >> m_hChromaShift;
4081
4082
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4083
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4084
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4085
                                refStride = tmpPredYuv.m_csize;
4086
                                distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4087
                            }
4088
                        }
4089
                        costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4090
4091
                        if (costMergeTemp < costMergeBest)
4092
                        {
4093
                            costMergeBest = costMergeTemp;
4094
                            mrgIdxBest = mrgIdxTemp;
4095
                        }
4096
                    }
4097
                }
4098
4099
                // merge wins only when strictly cheaper; costMergeBest stays MAX_INT when no
                // candidate qualified, so mrgIdxBest is a valid index inside this branch
                if (costMergeBest < costAMVPBest)
4100
                {
4101
                    cost[combo] += costMergeBest;
4102
                    isIBCMergeMode[combo] = true;
4103
                    bestIBCMvpIdx[combo] = mrgIdxBest;
4104
4105
                    MVField mvField[2];
4106
                    MV mv(cMvFieldNeighboursIBC[mrgIdxBest][0].mv.x, cMvFieldNeighboursIBC[mrgIdxBest][0].mv.y);
4107
                    mvField[0].mv = mv;
4108
                    mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4109
                    mvField[1].mv = cMvZero;
4110
                    mvField[1].refIdx = REF_NOT_VALID;
4111
                    cMRGMvFieldIBC[combo][0] = mvField[0];
4112
                    cMRGMvFieldIBC[combo][1] = mvField[1];
4113
                }
4114
                else
4115
                {
4116
                    cost[combo] += costAMVPBest;
4117
                    isIBCMergeMode[combo] = false;
4118
                    bestIBCMvpIdx[combo] = mvpIdxBest;
4119
                    // store the chosen predictor back at the precision it had before the >> 2
                    cMvPredCand[combo][partIdx].set(bvPred[mvpIdxBest].x << 2, bvPred[mvpIdxBest].y << 2);
4120
                }
4121
4122
                // leave the winning IBC motion in cu for the rest of this combination
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);  // list 0 prediction
4123
                if (isIBCMergeMode[combo])
4124
                {
4125
                    cu.setPUMv(0, cMRGMvFieldIBC[combo][0].mv, pu.puAbsPartIdx, partIdx);
4126
                }
4127
                else
4128
                {
4129
                    cu.setPUMv(0, iMvCandList[8 + partIdx], pu.puAbsPartIdx, partIdx);
4130
                    cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4131
                    cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4132
                }
4133
                // ibc merge
4134
                /////////////////////////////////////////////////////////////
4135
            }
4136
            else // is inter PU
4137
            {
4138
                uint32_t  costInterTemp = 0;
4139
                uint32_t  costInterBest = UINT32_MAX;
4140
                const pixel* currStart;
4141
                int currStride;
4142
                pixel* ref;
4143
                int refStride;
4144
                MergeData merge;
4145
                memset(&merge, 0, sizeof(merge));
4146
                for (int refList = 0; refList < numPredDir; refList++)
4147
                {
4148
                    // at most two references per list are tried; list 0 excludes its last
                    // entry, which the IBC path above uses as the current picture
                    uint32_t numRef = refList ? ((m_slice->m_numRefIdx[1] > 1) ? 2 : 1) : ((m_slice->m_numRefIdx[0] - 1 > 1) ? 2 : 1);
4149
                    for (uint32_t refIdx = 0; refIdx < numRef; refIdx++)
4150
                    {
4151
                        MV cMv = iMvCandList[4 * refList + 2 * refIdx + partIdx];
4152
4153
                        cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4154
                        cu.getPMV(intraBCMixedMode.interNeighbours, refList, refIdx, intraBCMixedMode.amvpCand[refList][refIdx], mvc, partIdx, pu.puAbsPartIdx);
4155
                        int mvpIdx;
4156
4157
                        uint32_t  tempCost0 = 0;
4158
                        uint32_t  tempCost1 = 0;
4159
                        mvPred[0] = intraBCMixedMode.amvpCand[refList][refIdx][0];
4160
                        mvPred[1] = intraBCMixedMode.amvpCand[refList][refIdx][1];
4161
4162
                        // pick the predictor with the smaller bit cost (index 0 on a tie)
                        m_me.setMVP(mvPred[0]);
4163
                        tempCost0 = m_me.bitcost(cMv, mvPred[0]);
4164
                        m_me.setMVP(mvPred[1]);
4165
                        tempCost1 = m_me.bitcost(cMv, mvPred[1]);
4166
                        if (tempCost1 < tempCost0)
4167
                        {
4168
                            mvpIdx = 1;
4169
                        }
4170
                        else
4171
                        {
4172
                            mvpIdx = 0;
4173
                        }
4174
                        uint32_t bitsTemp = m_listSelBits[refList] + MVP_IDX_BITS;
4175
                        bitsTemp += getTUBits(refIdx, numRef);
4176
4177
                        m_me.setMVP(mvPred[mvpIdx]);
4178
                        // with integer MVs the two low bits of the candidate are cleared
                        if (cu.m_slice->m_useIntegerMv)
4179
                        {
4180
                            cu.setPUMv(refList, (cMv >> 2) << 2, pu.puAbsPartIdx, partIdx);
4181
                        }
4182
                        else
4183
                        {
4184
                            cu.setPUMv(refList, cMv, pu.puAbsPartIdx, partIdx);
4185
                        }
4186
                        cu.setPURefIdx(refList, refIdx, pu.puAbsPartIdx, partIdx);
4187
                        cu.setPUInterDir(1 + refList, pu.puAbsPartIdx, partIdx);
4188
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4189
4190
                        costInterTemp = 0;
4191
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4192
                        {
4193
                            int tempHeight, tempWidth;
4194
                            if (ch == 0)
4195
                            {
4196
                                tempHeight = dummyHeight;
4197
                                tempWidth = dummyWidth;
4198
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4199
                                refStride = tmpPredYuv.m_size;
4200
                                costInterTemp += m_me.bufSAD(ref, refStride);
4201
                            }
4202
                            else
4203
                            {
4204
                                tempHeight = dummyHeight >> m_vChromaShift;
4205
                                tempWidth = dummyWidth >> m_hChromaShift;
4206
4207
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4208
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4209
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4210
                                refStride = tmpPredYuv.m_csize;
4211
                                costInterTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4212
                            }
4213
4214
                            // stop accumulating once this candidate can no longer beat the best
                            if (costInterTemp >= costInterBest)
4215
                            {
4216
                                break;
4217
                            }
4218
                        }
4219
                        // undo the trial reference index before the next candidate
                        cu.setPURefIdx(refList, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4220
4221
                        costInterTemp += m_me.bitcost(cMv, mvPred[mvpIdx]);
4222
                        costInterTemp += m_rdCost.getCost(bitsTemp);
4223
4224
                        if (costInterTemp < costInterBest)
4225
                        {
4226
                            costInterBest = costInterTemp;
4227
                            bestInterMvpIdx[combo] = mvpIdx;
4228
                            bestInterDir[combo] = refList;
4229
                            bestRefIdx[combo] = refIdx;
4230
                            cMvPredCand[combo][partIdx] = mvPred[mvpIdx];
4231
                        }
4232
                    }
4233
                } // end RefIdx and RefList search
4234
4235
                uint32_t MRGInterDir = 0;
4236
                uint32_t MRGIndex = 0;
4237
4238
                // find Merge result
4239
                uint32_t MRGCost = UINT32_MAX;
4240
                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4241
4242
                // NOTE(review): the value returned by mergeEstimation() is discarded and MRGCost
                // is never assigned after its UINT32_MAX initialisation, so the
                // "MRGCost < costInterBest" test below cannot be true as written and
                // isMergeMode[combo] is never set here - confirm whether the result was
                // meant to be stored in MRGCost
                mergeEstimation(cu, cuGeom, pu, partIdx, merge);
4243
                MRGInterDir = merge.dir;
4244
                cMRGMvField[combo][0] = merge.mvField[0];
4245
                cMRGMvField[combo][1] = merge.mvField[1];
4246
                MRGIndex = merge.index;
4247
                cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4248
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4249
4250
                if (MRGCost < costInterBest)
4251
                {
4252
                    costInterBest = MRGCost;
4253
                    isMergeMode[combo] = true;
4254
                    bestInterMvpIdx[combo] = MRGIndex;
4255
                    bestInterDir[combo] = MRGInterDir;
4256
                }
4257
4258
                cost[combo] += costInterBest;
4259
                // leave the winning inter motion in cu for the rest of this combination
                if (isMergeMode[combo])
4260
                {
4261
                    cu.setPUInterDir(bestInterDir[combo], pu.puAbsPartIdx, partIdx);
4262
                    cu.setPUMv(0, cMRGMvField[combo][0].mv, pu.puAbsPartIdx, partIdx);
4263
                    cu.setPURefIdx(0, cMRGMvField[combo][0].refIdx, pu.puAbsPartIdx, partIdx);
4264
                    cu.setPUMv(1, cMRGMvField[combo][1].mv, pu.puAbsPartIdx, partIdx);
4265
                    cu.setPURefIdx(1, cMRGMvField[combo][1].refIdx, pu.puAbsPartIdx, partIdx);
4266
                }
4267
                else
4268
                {
4269
                    int refListOpt = bestInterDir[combo];
4270
                    int refIdxOpt = bestRefIdx[combo];
4271
                    if (cu.m_slice->m_useIntegerMv)
4272
                    {
4273
                        cu.setPUMv(refListOpt, (iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, pu.puAbsPartIdx, partIdx);
4274
                    }
4275
                    else
4276
                    {
4277
                        cu.setPUMv(refListOpt, iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt], pu.puAbsPartIdx, partIdx);
4278
                    }
4279
                    cu.setPURefIdx(refListOpt, refIdxOpt, pu.puAbsPartIdx, partIdx);
4280
                    cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4281
                    cu.setPUInterDir(1 + refListOpt, pu.puAbsPartIdx, partIdx);
4282
                    cu.m_mvpIdx[refListOpt][pu.puAbsPartIdx] = bestInterMvpIdx[combo];
4283
                }
4284
            }
4285
        } // for ipartIdx
4286
    } // for combo
4287
4288
    // both combinations had a zero IBC vector: nothing usable to report
    if (IBCValidFlag > 1)
4289
    {
4290
        return false;
4291
    }
4292
4293
    MV cMvd;
4294
    // NOTE(review): cMVFinal is declared but never used
    MV cMVFinal;
4295
    // write the winning combination back into cu; combo 0 is kept on a tie
    if (cost[0] <= cost[1])
4296
    {
4297
        int iDummyWidth1, iDummyHeight1;
4298
        uint32_t partAddr = 0;
4299
        uint32_t partIdx = 0;
4300
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4301
4302
        // combo 0, part 0: the IBC PU
        if (isIBCMergeMode[0])
4303
        {
4304
            cu.m_mergeFlag[partAddr] = true;
4305
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4306
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4307
            cu.setPUMv(0, cMRGMvFieldIBC[0][0].mv, partAddr, partIdx);
4308
            cu.setPURefIdx(0, cMRGMvFieldIBC[0][0].refIdx, partAddr, partIdx);
4309
            cu.setPUMv(1, cMRGMvFieldIBC[0][1].mv, partAddr, partIdx);
4310
            cu.setPURefIdx(1, cMRGMvFieldIBC[0][1].refIdx, partAddr, partIdx);
4311
4312
            cu.m_mvd[0][partAddr] = cMvZero;
4313
            cu.m_mvd[1][partAddr] = cMvZero;
4314
        }
4315
        else
4316
        {
4317
            cu.m_mergeFlag[partAddr] = false;
4318
4319
            // the IBC vector difference is stored shifted right by 2
            cMvd.set((iMvCandList[8].x - cMvPredCand[0][0].x) >> 2, (iMvCandList[8].y - cMvPredCand[0][0].y) >> 2);
4320
            cu.setPUMv(0, iMvCandList[8], partAddr, partIdx);
4321
            cu.m_mvd[0][partAddr] = cMvd;
4322
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4323
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4324
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4325
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4326
        }
4327
4328
        // combo 0, part 1: the inter PU
        partIdx = 1;
4329
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4330
4331
        if (isMergeMode[0])
4332
        {
4333
            cu.m_mergeFlag[partAddr] = true;
4334
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[0];
4335
            cu.setPUInterDir(bestInterDir[0], partAddr, partIdx);  // direction recorded from the merge estimate
4336
            cu.setPUMv(0, cMRGMvField[0][0].mv, partAddr, partIdx);
4337
            cu.setPURefIdx(0, cMRGMvField[0][0].refIdx, partAddr, partIdx);
4338
            cu.setPUMv(1, cMRGMvField[0][1].mv, partAddr, partIdx);
4339
            cu.setPURefIdx(1, cMRGMvField[0][1].refIdx, partAddr, partIdx);
4340
4341
            cu.m_mvd[0][partAddr] = cMvZero;
4342
            cu.m_mvd[1][partAddr] = cMvZero;
4343
        }
4344
        else
4345
        {
4346
            int refListOpt = bestInterDir[0];
4347
            int refIdxOpt = bestRefIdx[0];
4348
            if (cu.m_slice->m_useIntegerMv)
4349
            {
4350
                cMvd.set(((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[0][1].x >> 2)), ((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[0][1].y >> 2)));
4351
                cu.setPUMv(refListOpt, (iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4352
            }
4353
            else
4354
            {
4355
                cMvd.set(iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[0][1].x, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[0][1].y);
4356
                cu.setPUMv(refListOpt, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4357
            }
4358
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4359
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4360
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4361
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4362
            cu.m_mergeFlag[partAddr] = false;
4363
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[0];
4364
        }
4365
    }
4366
    else
4367
    {
4368
        int dummyWidth2, dummyHeight2;
4369
        uint32_t partAddr = 0;
4370
        uint32_t partIdx = 0;
4371
4372
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4373
4374
        // combo 1, part 0: the inter PU
        if (isMergeMode[1])
4375
        {
4376
            cu.m_mergeFlag[partAddr] = true;
4377
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[1];
4378
            cu.setPUInterDir(bestInterDir[1], partAddr, partIdx);  // direction recorded from the merge estimate
4379
            cu.setPUMv(0, cMRGMvField[1][0].mv, partAddr, partIdx);
4380
            cu.setPURefIdx(0, cMRGMvField[1][0].refIdx, partAddr, partIdx);
4381
            cu.setPUMv(1, cMRGMvField[1][1].mv, partAddr, partIdx);
4382
            cu.setPURefIdx(1, cMRGMvField[1][1].refIdx, partAddr, partIdx);
4383
4384
            cu.m_mvd[0][partAddr] = cMvZero;
4385
            cu.m_mvd[1][partAddr] = cMvZero;
4386
        }
4387
        else
4388
        {
4389
            int refListOpt = bestInterDir[1];
4390
            int refIdxOpt = bestRefIdx[1];
4391
            if (cu.m_slice->m_useIntegerMv)
4392
            {
4393
                cMvd.set((iMvCandList[2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[1][0].x >> 2), (iMvCandList[2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[1][0].y >> 2));
4394
                cu.setPUMv(refListOpt, (iMvCandList[2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4395
            }
4396
            else
4397
            {
4398
                cMvd.set(iMvCandList[2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[1][0].x, iMvCandList[2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[1][0].y);
4399
                cu.setPUMv(refListOpt, iMvCandList[2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4400
            }
4401
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4402
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4403
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4404
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4405
            cu.m_mergeFlag[partAddr] = false;
4406
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[1];
4407
        }
4408
4409
        // combo 1, part 1: the IBC PU
        partIdx = 1;
4410
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4411
4412
        if (isIBCMergeMode[1])
4413
        {
4414
            cu.m_mergeFlag[partAddr] = true;
4415
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4416
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4417
            cu.setPUMv(0, cMRGMvFieldIBC[1][0].mv, partAddr, partIdx);
4418
            cu.setPURefIdx(0, cMRGMvFieldIBC[1][0].refIdx, partAddr, partIdx);
4419
            cu.setPUMv(1, cMRGMvFieldIBC[1][1].mv, partAddr, partIdx);
4420
            cu.setPURefIdx(1, cMRGMvFieldIBC[1][1].refIdx, partAddr, partIdx);
4421
4422
            cu.m_mvd[0][partAddr] = cMvZero;
4423
            cu.m_mvd[1][partAddr] = cMvZero;
4424
        }
4425
        else
4426
        {
4427
            cu.m_mergeFlag[partAddr] = false;
4428
4429
            cMvd.set(((iMvCandList[9].x - cMvPredCand[1][1].x) >> 2), (iMvCandList[9].y - cMvPredCand[1][1].y) >> 2);
4430
            cu.setPUMv(0, iMvCandList[9], partAddr, partIdx);
4431
            cu.m_mvd[0][partAddr] = cMvd;
4432
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4433
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4434
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4435
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4436
        }
4437
    }
4438
    // build the final prediction for both parts from the motion data written above
    for (int partIdx = 0; partIdx < numPart; ++partIdx)
4439
    {
4440
        PredictionUnit pu(cu, cuGeom, partIdx);
4441
        motionCompensation(cu, pu, *predYuv, 1, 1);
4442
    }
4443
4444
    return true;
4445
}
4446
#endif
4447
4448
/* Fill blockBit[0..2] with the three bit estimates associated with the given partition
 * mode. Square partitions (2Nx2N, NxN) depend only on bPSlice; two-part partitions use a
 * fixed triple in P slices and a table lookup keyed by partIdx and lastMode otherwise.
 * An unrecognised cuMode trips X265_CHECK and leaves blockBit untouched. */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    const bool isSquare = cuMode == SIZE_2Nx2N || cuMode == SIZE_NxN;
    const bool isHorzSplit = cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD;
    const bool isVertSplit = cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N;

    if (isSquare)
    {
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        return;
    }

    if (!isHorzSplit && !isVertSplit)
    {
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        return;
    }

    /* two-part partitions in a P slice share one fixed triple */
    if (bPSlice)
    {
        blockBit[0] = 3;
        blockBit[1] = 0;
        blockBit[2] = 0;
        return;
    }

    /* tables are indexed [partIdx][lastMode] and yield the three estimates */
    static const uint32_t horzSplitBits[2][3][3] =
    {
        { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };
    static const uint32_t vertSplitBits[2][3][3] =
    {
        { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
    };

    const uint32_t* estimates = isHorzSplit ? horzSplitBits[partIdx][lastMode]
                                            : vertSplitBits[partIdx][lastMode];
    memcpy(blockBit, estimates, 3 * sizeof(uint32_t));
}
4499
4500
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
4501
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
{
    /* bit difference obtained by switching to the other of the two candidates */
    const int otherIdx = !mvpIdx;
    const int bitDelta = m_me.bitcost(mv, amvpCand[otherIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);

    /* the current predictor is kept unless the other one is strictly cheaper */
    if (bitDelta >= 0)
        return amvpCand[mvpIdx];

    /* switch predictor and re-price: swap the old bit cost for the new one */
    const uint32_t prevBits = outBits;
    mvpIdx = otherIdx;
    outBits = prevBits + bitDelta;
    outCost = (outCost - m_rdCost.getCost(prevBits)) + m_rdCost.getCost(outBits);
    return amvpCand[mvpIdx];
}
4513
4514
/* Update to default MVP when using an alternative mvp */
4515
void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
4516
0
{
4517
0
    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
4518
0
    uint32_t origOutBits = outBits;
4519
0
    outBits = origOutBits + diffBits;
4520
0
    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4521
0
}
4522
4523
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
4524
0
{
4525
0
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
4526
0
    mvmin = mvp - dist;
4527
0
    mvmax = mvp + dist;
4528
4529
0
    if (m_vertRestriction)
4530
0
    {
4531
0
        int mvRestricted = (56 - 1) << 2; // -1 to consider subpel search
4532
0
        if (mvmax.y >= mvRestricted)
4533
0
        {
4534
0
            mvmax.y = mvRestricted; //only positive side is restricted
4535
0
        }
4536
0
    }
4537
4538
0
    cu.clipMv(mvmin);
4539
0
    cu.clipMv(mvmax);
4540
4541
0
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
4542
0
          cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
4543
0
          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
4544
0
    {
4545
0
        int safeX, maxSafeMv;
4546
0
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
4547
0
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
4548
0
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
4549
0
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
4550
0
    }
4551
4552
    // apply restrict on slices
4553
0
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
4554
0
    {
4555
0
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
4556
0
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
4557
0
    }
4558
4559
    /* Clip search range to signaled maximum MV length.
4560
     * We do not support this VUI field being changed from the default */
4561
0
    const int maxMvLen = (1 << 15) - 1;
4562
0
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
4563
0
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
4564
0
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
4565
0
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);
4566
4567
0
    mvmin >>= 2;
4568
0
    mvmax >>= 2;
4569
4570
    /* conditional clipping for frame parallelism */
4571
0
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
4572
0
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);
4573
4574
    /* conditional clipping for negative mv range */
4575
0
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
4576
0
}
4577
4578
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
4579
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
4580
0
{
4581
0
    CUData& cu = interMode.cu;
4582
0
    Yuv* reconYuv = &interMode.reconYuv;
4583
0
    const Yuv* fencYuv = interMode.fencYuv;
4584
0
    Yuv* predYuv = &interMode.predYuv;
4585
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
4586
0
    uint32_t depth  = cu.m_cuDepth[0];
4587
4588
    // No residual coding : SKIP mode
4589
4590
0
    cu.setPredModeSubParts(MODE_SKIP);
4591
0
    cu.clearCbf();
4592
0
    cu.setTUDepthSubParts(0, 0, depth);
4593
4594
0
    reconYuv->copyFromYuv(interMode.predYuv);
4595
4596
    // Luma
4597
0
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
4598
0
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
4599
0
    interMode.distortion = interMode.lumaDistortion;
4600
    // Chroma
4601
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
4602
0
    {
4603
0
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
4604
0
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
4605
0
        interMode.distortion += interMode.chromaDistortion;
4606
0
    }
4607
0
    cu.m_distortion[0] = interMode.distortion;
4608
0
    m_entropyCoder.load(m_rqt[depth].cur);
4609
0
    m_entropyCoder.resetBits();
4610
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
4611
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
4612
0
    m_entropyCoder.codeSkipFlag(cu, 0);
4613
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
4614
0
    m_entropyCoder.codeMergeIndex(cu, 0);
4615
0
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
4616
0
    interMode.coeffBits = 0;
4617
0
    interMode.totalBits = interMode.mvBits + skipFlagBits;
4618
0
    if (m_rdCost.m_psyRd)
4619
0
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
4620
0
    else if(m_rdCost.m_ssimRd)
4621
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
4622
4623
0
    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
4624
0
    updateModeCost(interMode);
4625
0
    m_entropyCoder.store(interMode.contexts);
4626
0
}
4627
4628
/* encode residual and calculate rate-distortion for a CU block.
4629
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
4630
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
4631
0
{
4632
0
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
4633
4634
0
    CUData& cu = interMode.cu;
4635
0
    Yuv* reconYuv = &interMode.reconYuv;
4636
0
    Yuv* predYuv = &interMode.predYuv;
4637
0
    uint32_t depth = cuGeom.depth;
4638
0
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
4639
0
    const Yuv* fencYuv = interMode.fencYuv;
4640
4641
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
4642
4643
0
    uint32_t log2CUSize = cuGeom.log2CUSize;
4644
0
    int sizeIdx = log2CUSize - 2;
4645
4646
0
    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
4647
4648
0
    uint32_t tuDepthRange[2];
4649
0
    cu.getInterTUQtDepthRange(tuDepthRange, 0);
4650
4651
0
    m_entropyCoder.load(m_rqt[depth].cur);
4652
4653
0
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
4654
0
        m_maxTUDepth = -1;
4655
0
    else if (m_limitTU & X265_TU_LIMIT_BFS)
4656
0
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
4657
4658
0
    Cost costs;
4659
0
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
4660
0
    {
4661
        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
4662
0
        int32_t tempDepth = m_maxTUDepth;
4663
0
        if (m_maxTUDepth != -1)
4664
0
        {
4665
0
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
4666
0
            uint32_t minSize = tuDepthRange[0];
4667
0
            uint32_t maxSize = tuDepthRange[1];
4668
0
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
4669
0
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
4670
0
        }
4671
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
4672
0
        m_maxTUDepth = tempDepth;
4673
0
    }
4674
0
    else
4675
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
4676
4677
0
    uint32_t tqBypass = cu.m_tqBypass[0];
4678
0
    if (!tqBypass)
4679
0
    {
4680
0
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
4681
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
4682
0
        {
4683
0
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
4684
0
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
4685
0
        }
4686
4687
        /* Consider the RD cost of not signaling any residual */
4688
0
        m_entropyCoder.load(m_rqt[depth].cur);
4689
0
        m_entropyCoder.resetBits();
4690
0
        m_entropyCoder.codeQtRootCbfZero();
4691
0
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
4692
4693
0
        uint32_t cbf0Energy; uint64_t cbf0Cost;
4694
0
        if (m_rdCost.m_psyRd)
4695
0
        {
4696
0
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
4697
0
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
4698
0
        }
4699
0
        else if(m_rdCost.m_ssimRd)
4700
0
        {
4701
0
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
4702
0
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
4703
0
        }
4704
0
        else
4705
0
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
4706
4707
0
        if (cbf0Cost < costs.rdcost)
4708
0
        {
4709
0
            cu.clearCbf();
4710
0
            cu.setTUDepthSubParts(0, 0, depth);
4711
0
        }
4712
0
    }
4713
4714
0
    if (cu.getQtRootCbf(0))
4715
0
        saveResidualQTData(cu, *resiYuv, 0, 0);
4716
4717
    /* calculate signal bits for inter/merge/skip coded CU */
4718
0
    m_entropyCoder.load(m_rqt[depth].cur);
4719
4720
0
    m_entropyCoder.resetBits();
4721
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
4722
0
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
4723
4724
0
    uint32_t coeffBits, bits, mvBits;
4725
0
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
4726
0
    {
4727
0
        cu.setPredModeSubParts(MODE_SKIP);
4728
4729
        /* Merge/Skip */
4730
0
        coeffBits = mvBits = 0;
4731
0
        m_entropyCoder.codeSkipFlag(cu, 0);
4732
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
4733
0
        m_entropyCoder.codeMergeIndex(cu, 0);
4734
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
4735
0
        bits = mvBits + skipFlagBits;
4736
0
    }
4737
0
    else
4738
0
    {
4739
0
        m_entropyCoder.codeSkipFlag(cu, 0);
4740
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
4741
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
4742
0
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
4743
0
        m_entropyCoder.codePredInfo(cu, 0);
4744
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
4745
4746
0
        bool bCodeDQP = m_slice->m_pps->bUseDQP;
4747
0
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
4748
0
        bits = m_entropyCoder.getNumberOfWrittenBits();
4749
4750
0
        coeffBits = bits - mvBits - skipFlagBits;
4751
0
    }
4752
4753
0
    m_entropyCoder.store(interMode.contexts);
4754
4755
0
    if (cu.getQtRootCbf(0))
4756
0
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
4757
0
    else
4758
0
        reconYuv->copyFromYuv(*predYuv);
4759
4760
    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
4761
0
    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
4762
0
    interMode.distortion = bestLumaDist;
4763
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
4764
0
    {
4765
0
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
4766
0
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
4767
0
        interMode.chromaDistortion = bestChromaDist;
4768
0
        interMode.distortion += bestChromaDist;
4769
0
    }
4770
0
    if (m_rdCost.m_psyRd)
4771
0
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
4772
0
    else if(m_rdCost.m_ssimRd)
4773
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
4774
4775
0
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
4776
0
    interMode.totalBits = bits;
4777
0
    interMode.lumaDistortion = bestLumaDist;
4778
0
    interMode.coeffBits = coeffBits;
4779
0
    interMode.mvBits = mvBits;
4780
0
    cu.m_distortion[0] = interMode.distortion;
4781
0
    updateModeCost(interMode);
4782
0
    checkDQP(interMode, cuGeom);
4783
4784
#if ENABLE_SCC_EXT
4785
    if (m_param->bEnableSCC)
4786
        interMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
4787
#endif
4788
0
}
4789
4790
/* Transform and quantize the inter residual stored in m_rqt[cuGeom.depth].tmpResiYuv
 * for the TU at (absPartIdx, tuDepth).
 *
 * When the TU is coded at this size, each plane goes through transformNxN(); a plane
 * with significant coefficients is passed to invtransformNxN() and gets its cbf bit
 * set, a plane without any has its residual block zero-filled and its cbf cleared.
 * Otherwise the function recurses into the four quadrants and ORs the children's
 * cbf into this depth's cbf bit. depthRange holds the {min, max} log2 TU sizes. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t depth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    /* a non-2Nx2N CU is not coded as one root TU while a split is still allowed */
    const bool bForceRootSplit = cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0];
    const bool bCodeFull = log2TrSize <= depthRange[1] && !bForceRootSplit;

    if (!bCodeFull)
    {
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");

        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
        for (uint32_t quad = 0; quad < 4; quad++)
        {
            const uint32_t quadPartIdx = absPartIdx + quad * quadParts;

            residualTransformQuantInter(mode, cuGeom, quadPartIdx, tuDepth + 1, depthRange);
            lumaCbf |= cu.getCbf(quadPartIdx, TEXT_LUMA, tuDepth + 1);
            if (bHasChroma)
            {
                cbCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                crCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            }
        }

        /* propagate the children's cbf into this depth's bit */
        cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
        if (bHasChroma)
        {
            cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
            cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
        }
        return;
    }

    // code full block
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t codeChroma = bHasChroma ? 1 : 0;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        /* chroma TU would be smaller than 4x4: code it once, at the parent depth,
         * and only from the first of the four luma blocks */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        tuDepthC--;
        codeChroma &= !(absPartIdx & 3);
    }

    const uint32_t chromaPartStep = cuGeom.numPartitions >> (tuDepthC * 2);
    const uint32_t cbfBit = 1 << tuDepth;

    const uint32_t lumaCoeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* lumaCoeff = cu.m_trCoeff[0] + lumaCoeffOffset;

    const uint32_t lumaSizeIdx = log2TrSize - 2;

    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
    cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const Yuv* fencYuv = mode.fencYuv;

    int16_t* lumaResi = resiYuv.getLumaAddr(absPartIdx);
    const uint32_t lumaResiStride = resiYuv.m_size;

    const pixel* lumaFenc = fencYuv->getLumaAddr(absPartIdx);
    const uint32_t lumaNumSig = m_quant.transformNxN(cu, lumaFenc, fencYuv->m_size, lumaResi, lumaResiStride, lumaCoeff, log2TrSize, TEXT_LUMA, absPartIdx, false);

    if (lumaNumSig)
    {
        m_quant.invtransformNxN(cu, lumaResi, lumaResiStride, lumaCoeff, log2TrSize, TEXT_LUMA, false, false, lumaNumSig);
        cu.setCbfSubParts(cbfBit, TEXT_LUMA, absPartIdx, depth);
    }
    else
    {
        primitives.cu[lumaSizeIdx].blockfill_s[lumaResiStride % 64 == 0](lumaResi, lumaResiStride, 0);
        cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
    }

    if (!codeChroma)
        return;

    const uint32_t chromaSizeIdx = log2TrSizeC - 2;
    const uint32_t chromaResiStride = resiYuv.m_csize;

    const uint32_t chromaCoeffOffset = lumaCoeffOffset >> (m_hChromaShift + m_vChromaShift);
    coeff_t* cbCoeff = cu.m_trCoeff[1] + chromaCoeffOffset;
    coeff_t* crCoeff = cu.m_trCoeff[2] + chromaCoeffOffset;

    /* 4:2:2 chroma is iterated as two vertically stacked sub-TUs */
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);

    TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, chromaPartStep, absPartIdx);
    do
    {
        const uint32_t partIdxC = tuIterator.absPartIdxTURelCU;
        const uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

        cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, partIdxC, tuIterator.absPartIdxStep);
        cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, partIdxC, tuIterator.absPartIdxStep);

        /* Cb */
        int16_t* cbResi = resiYuv.getCbAddr(partIdxC);
        const pixel* cbFenc = fencYuv->getCbAddr(partIdxC);
        const uint32_t cbNumSig = m_quant.transformNxN(cu, cbFenc, fencYuv->m_csize, cbResi, chromaResiStride, cbCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, partIdxC, false);
        if (cbNumSig)
        {
            m_quant.invtransformNxN(cu, cbResi, chromaResiStride, cbCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, cbNumSig);
            cu.setCbfPartRange(cbfBit, TEXT_CHROMA_U, partIdxC, tuIterator.absPartIdxStep);
        }
        else
        {
            primitives.cu[chromaSizeIdx].blockfill_s[chromaResiStride % 64 == 0](cbResi, chromaResiStride, 0);
            cu.setCbfPartRange(0, TEXT_CHROMA_U, partIdxC, tuIterator.absPartIdxStep);
        }

        /* Cr */
        int16_t* crResi = resiYuv.getCrAddr(partIdxC);
        const pixel* crFenc = fencYuv->getCrAddr(partIdxC);
        const uint32_t crNumSig = m_quant.transformNxN(cu, crFenc, fencYuv->m_csize, crResi, chromaResiStride, crCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, partIdxC, false);
        if (crNumSig)
        {
            m_quant.invtransformNxN(cu, crResi, chromaResiStride, crCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, crNumSig);
            cu.setCbfPartRange(cbfBit, TEXT_CHROMA_V, partIdxC, tuIterator.absPartIdxStep);
        }
        else
        {
            primitives.cu[chromaSizeIdx].blockfill_s[chromaResiStride % 64 == 0](crResi, chromaResiStride, 0);
            cu.setCbfPartRange(0, TEXT_CHROMA_V, partIdxC, tuIterator.absPartIdxStep);
        }
    }
    while (tuIterator.isNextSection());

    if (splitIntoSubTUs)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
4926
4927
/* RD cost of signaling a zero cbf for component compId at tuDepth. dist and
 * energy are the distortion and psy/ssim energy supplied by the caller; the
 * cost formula follows whichever of psy-rd / ssim-rd is active, else plain RD */
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
{
    const uint32_t zeroCbfBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);

    if (m_rdCost.m_psyRd)
        return m_rdCost.calcPsyRdCost(dist, zeroCbfBits, energy);

    if (m_rdCost.m_ssimRd)
        return m_rdCost.calcSsimRdCost(dist, zeroCbfBits, energy);

    return m_rdCost.calcRdCost(dist, zeroCbfBits);
}
4938
4939
/* Estimate the cost of splitting the TU at (absPartIdx, tuDepth) into four.
 * Each quadrant is evaluated with estimateResidualQT(), which accumulates into
 * splitCost; the bits for this level's subdivision/cbf flags are then added and
 * splitCost.rdcost is recomputed. Returns true when any quadrant has a non-zero
 * luma or chroma cbf. */
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
{
    CUData& cu = mode.cu;
    const uint32_t depth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
    for (uint32_t quad = 0; quad < 4; quad++)
    {
        const uint32_t quadPartIdx = absPartIdx + quad * quadParts;

        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && quad == 1)
        {
            // Fetch maximum TU depth of first sub partition to limit recursion of others
            m_maxTUDepth = cu.m_tuDepth[0];
            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
        }

        estimateResidualQT(mode, cuGeom, quadPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);

        lumaCbf |= cu.getCbf(quadPartIdx, TEXT_LUMA, tuDepth + 1);
        if (bHasChroma)
        {
            cbCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            crCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
    }

    /* propagate the children's cbf into this depth's bit */
    cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
    if (bHasChroma)
    {
        cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
        cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
    }

    // The coefficient bits of every sub-block were already collected by the recursive
    // calls above, so only the subdivision and cbf flags of this level are coded here
    // (chroma cbfs are coded differently from luma).
    // Open question kept from the original author: the coefficients may have been coded
    // with the contexts of a deeper level (e.g. depth 2) while these cbfs are coded with
    // the contexts of this level (e.g. depth 0).
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
    m_entropyCoder.resetBits();
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
    const uint32_t subdivCbfBits = m_entropyCoder.getNumberOfWrittenBits();
    splitCost.bits += subdivCbfBits;

    if (m_rdCost.m_psyRd)
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else if (m_rdCost.m_ssimRd)
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

    return lumaCbf || cbCbf || crCbf;
}
4990
4991
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
4992
0
{
4993
0
    CUData& cu = mode.cu;
4994
0
    uint32_t depth = cuGeom.depth + tuDepth;
4995
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
4996
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
4997
4998
0
    bool bCheckSplit = log2TrSize > depthRange[0];
4999
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5000
0
    bool bSaveTUData = false, bLoadTUData = false;
5001
0
    uint32_t idx = 0;
5002
5003
0
    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
5004
0
    {
5005
0
        if (bCheckSplit && bCheckFull && tuDepth)
5006
0
        {
5007
0
            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
5008
0
            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
5009
0
            idx = (depth - 1) * 4 + qIdx;
5010
0
            if (splitMore)
5011
0
            {
5012
0
                bLoadTUData = true;
5013
0
                bCheckFull = false;
5014
0
            }
5015
0
            else
5016
0
            {
5017
0
                bSaveTUData = true;
5018
0
                bCheckSplit = false;
5019
0
            }
5020
0
        }
5021
0
    }
5022
0
    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
5023
0
    {
5024
0
        if (bCheckSplit && m_maxTUDepth >= 0)
5025
0
        {
5026
0
            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
5027
0
            bCheckSplit = log2TrSize > log2MaxTrSize;
5028
0
        }
5029
0
    }
5030
5031
0
    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
5032
5033
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
5034
0
        bCheckFull = false;
5035
5036
0
    X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
5037
5038
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5039
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5040
0
    uint32_t tuDepthC = tuDepth;
5041
0
    if (log2TrSizeC < 2)
5042
0
    {
5043
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5044
0
        log2TrSizeC = 2;
5045
0
        tuDepthC--;
5046
0
        codeChroma &= !(absPartIdx & 3);
5047
0
    }
5048
5049
    // code full block
5050
0
    Cost fullCost;
5051
0
    fullCost.rdcost = MAX_INT64;
5052
5053
0
    uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5054
0
    uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5055
0
    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5056
0
    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5057
0
    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5058
0
    uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5059
0
    uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
5060
5061
0
    m_entropyCoder.store(m_rqt[depth].rqtRoot);
5062
5063
0
    uint32_t trSize = 1 << log2TrSize;
5064
0
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5065
0
    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5066
0
    const Yuv* fencYuv = mode.fencYuv;
5067
5068
    // code full block
5069
0
    if (bCheckFull)
5070
0
    {
5071
0
        uint32_t trSizeC = 1 << log2TrSizeC;
5072
0
        int partSize = partitionFromLog2Size(log2TrSize);
5073
0
        int partSizeC = partitionFromLog2Size(log2TrSizeC);
5074
0
        const uint32_t qtLayer = log2TrSize - 2;
5075
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5076
0
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
5077
5078
0
        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
5079
0
        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
5080
0
        bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
5081
5082
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5083
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5084
5085
0
        if (bEnableRDOQ)
5086
0
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5087
5088
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5089
0
        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
5090
0
        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5091
0
        cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
5092
5093
0
        m_entropyCoder.resetBits();
5094
5095
0
        if (bSplitPresentFlag && log2TrSize > depthRange[0])
5096
0
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
5097
5098
0
        if (cbfFlag[TEXT_LUMA][0])
5099
0
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
5100
0
        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
5101
5102
0
        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
5103
5104
        //Assuming zero residual 
5105
0
        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5106
0
        uint32_t zeroEnergyY = 0;
5107
0
        if (m_rdCost.m_psyRd)
5108
0
            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5109
0
        else if(m_rdCost.m_ssimRd)
5110
0
            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
5111
5112
0
        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
5113
0
        uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
5114
5115
0
        if (cbfFlag[TEXT_LUMA][0])
5116
0
        {
5117
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
5118
5119
            // non-zero cost calculation for luma - This is an approximation
5120
            // finally we have to encode correct cbf after comparing with null cost
5121
0
            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
5122
0
            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
5123
0
            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
5124
0
            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5125
0
            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
5126
0
            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
5127
0
            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
5128
5129
0
            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
5130
0
            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
5131
0
            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
5132
0
            if (m_rdCost.m_psyRd)
5133
0
            {
5134
0
                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
5135
0
                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5136
0
            }
5137
0
            else if(m_rdCost.m_ssimRd)
5138
0
            {
5139
0
                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
5140
0
                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5141
0
            }
5142
0
            else
5143
0
                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
5144
5145
0
            if (cu.m_tqBypass[0])
5146
0
            {
5147
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5148
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5149
0
            }
5150
0
            else
5151
0
            {
5152
                // zero-cost calculation for luma. This is an approximation
5153
                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
5154
                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
5155
0
                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5156
5157
0
                if (nullCostY < singleCostY)
5158
0
                {
5159
0
                    cbfFlag[TEXT_LUMA][0] = 0;
5160
0
                    singleBits[TEXT_LUMA][0] = 0;
5161
0
                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5162
#if CHECKED_BUILD || _DEBUG
5163
                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
5164
                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
5165
#endif
5166
0
                    if (checkTransformSkipY)
5167
0
                        minCost[TEXT_LUMA][0] = nullCostY;
5168
0
                    singleDist[TEXT_LUMA][0] = zeroDistY;
5169
0
                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5170
0
                }
5171
0
                else
5172
0
                {
5173
0
                    if (checkTransformSkipY)
5174
0
                        minCost[TEXT_LUMA][0] = singleCostY;
5175
0
                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
5176
0
                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5177
0
                }
5178
0
            }
5179
0
        }
5180
0
        else
5181
0
        {
5182
0
            if (checkTransformSkipY)
5183
0
                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5184
0
            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5185
0
            singleDist[TEXT_LUMA][0] = zeroDistY;
5186
0
            singleBits[TEXT_LUMA][0] = 0;
5187
0
            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5188
0
        }
5189
5190
0
        cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5191
5192
0
        if (codeChroma)
5193
0
        {
5194
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5195
0
            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
5196
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5197
0
            {
5198
0
                sse_t zeroDistC = 0;
5199
0
                uint32_t zeroEnergyC = 0;
5200
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5201
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5202
5203
0
                do
5204
0
                {
5205
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5206
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5207
5208
0
                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5209
5210
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5211
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5212
5213
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5214
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5215
0
                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
5216
0
                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
5217
5218
0
                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
5219
0
                    if (cbfFlag[chromaId][tuIterator.section])
5220
0
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5221
5222
0
                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
5223
5224
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5225
0
                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
5226
5227
                    // Assuming zero residual 
5228
0
                    if (m_rdCost.m_psyRd)
5229
0
                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
5230
0
                    else if(m_rdCost.m_ssimRd)
5231
0
                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5232
5233
0
                    if (cbfFlag[chromaId][tuIterator.section])
5234
0
                    {
5235
0
                        m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
5236
0
                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
5237
5238
                        // non-zero cost calculation for chroma, same as luma - This is an approximation
5239
                        // finally we have to encode correct cbf after comparing with null cost
5240
0
                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
5241
0
                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
5242
0
                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5243
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5244
0
                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5245
0
                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
5246
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
5247
0
                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
5248
0
                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
5249
0
                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
5250
0
                        if (m_rdCost.m_psyRd)
5251
0
                        {
5252
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
5253
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5254
0
                        }
5255
0
                        else if(m_rdCost.m_ssimRd)
5256
0
                        {
5257
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5258
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5259
0
                        }
5260
0
                        else
5261
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
5262
5263
0
                        if (cu.m_tqBypass[0])
5264
0
                        {
5265
0
                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5266
0
                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5267
0
                        }
5268
0
                        else
5269
0
                        {
5270
                            //zero-cost calculation for chroma. This is an approximation
5271
0
                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
5272
5273
0
                            if (nullCostC < singleCostC)
5274
0
                            {
5275
0
                                cbfFlag[chromaId][tuIterator.section] = 0;
5276
0
                                singleBits[chromaId][tuIterator.section] = 0;
5277
0
                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5278
#if CHECKED_BUILD || _DEBUG
5279
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5280
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
5281
#endif
5282
0
                                if (checkTransformSkipC)
5283
0
                                    minCost[chromaId][tuIterator.section] = nullCostC;
5284
0
                                singleDist[chromaId][tuIterator.section] = zeroDistC;
5285
0
                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5286
0
                            }
5287
0
                            else
5288
0
                            {
5289
0
                                if (checkTransformSkipC)
5290
0
                                    minCost[chromaId][tuIterator.section] = singleCostC;
5291
0
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5292
0
                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5293
0
                            }
5294
0
                        }
5295
0
                    }
5296
0
                    else
5297
0
                    {
5298
0
                        if (checkTransformSkipC)
5299
0
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
5300
0
                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5301
0
                        singleBits[chromaId][tuIterator.section] = 0;
5302
0
                        singleDist[chromaId][tuIterator.section] = zeroDistC;
5303
0
                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5304
0
                    }
5305
5306
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5307
0
                }
5308
0
                while (tuIterator.isNextSection());
5309
0
            }
5310
0
        }
5311
5312
0
        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
5313
0
        {
5314
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5315
0
            {
5316
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5317
0
                do
5318
0
                {
5319
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5320
0
                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5321
0
                }
5322
0
                while(tuIterator.isNextSection());
5323
0
            }
5324
0
        }
5325
0
        if (checkTransformSkipY)
5326
0
        {
5327
0
            sse_t nonZeroDistY = 0;
5328
0
            uint32_t nonZeroEnergyY = 0;
5329
0
            uint64_t singleCostY = MAX_INT64;
5330
5331
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5332
5333
0
            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
5334
5335
0
            if (bEnableRDOQ)
5336
0
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5337
5338
0
            fenc = fencYuv->getLumaAddr(absPartIdx);
5339
0
            resi = resiYuv.getLumaAddr(absPartIdx);
5340
0
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
5341
5342
0
            if (numSigTSkipY)
5343
0
            {
5344
0
                m_entropyCoder.resetBits();
5345
0
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
5346
0
                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
5347
0
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
5348
5349
0
                m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
5350
0
                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5351
5352
0
                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
5353
0
                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
5354
0
                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
5355
5356
0
                if (m_rdCost.m_psyRd)
5357
0
                {
5358
0
                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
5359
0
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5360
0
                }
5361
0
                else if(m_rdCost.m_ssimRd)
5362
0
                {
5363
0
                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
5364
0
                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5365
0
                }
5366
0
                else
5367
0
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
5368
0
            }
5369
5370
0
            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
5371
0
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5372
0
            else
5373
0
            {
5374
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5375
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5376
0
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
5377
0
                bestTransformMode[TEXT_LUMA][0] = 1;
5378
0
                if (m_param->limitTU)
5379
0
                    numSig[TEXT_LUMA][0] = numSigTSkipY;
5380
0
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
5381
0
                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
5382
0
                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
5383
0
            }
5384
5385
0
            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5386
0
        }
5387
5388
0
        if (codeChroma && checkTransformSkipC)
5389
0
        {
5390
0
            sse_t nonZeroDistC = 0;
5391
0
            uint32_t nonZeroEnergyC = 0;
5392
0
            uint64_t singleCostC = MAX_INT64;
5393
0
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
5394
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5395
5396
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5397
5398
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5399
0
            {
5400
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5401
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5402
5403
0
                do
5404
0
                {
5405
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5406
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5407
5408
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5409
5410
0
                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5411
5412
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5413
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5414
5415
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5416
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5417
0
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
5418
5419
0
                    m_entropyCoder.resetBits();
5420
0
                    singleBits[chromaId][tuIterator.section] = 0;
5421
5422
0
                    if (numSigTSkipC)
5423
0
                    {
5424
0
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
5425
0
                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5426
0
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
5427
5428
0
                        m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
5429
0
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
5430
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5431
0
                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
5432
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
5433
0
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
5434
0
                        if (m_rdCost.m_psyRd)
5435
0
                        {
5436
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
5437
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5438
0
                        }
5439
0
                        else if(m_rdCost.m_ssimRd)
5440
0
                        {
5441
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5442
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5443
0
                        }
5444
0
                        else
5445
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
5446
0
                    }
5447
5448
0
                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
5449
0
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5450
0
                    else
5451
0
                    {
5452
0
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5453
0
                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5454
0
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
5455
0
                        bestTransformMode[chromaId][tuIterator.section] = 1;
5456
0
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5457
0
                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
5458
0
                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
5459
0
                    }
5460
5461
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5462
0
                }
5463
0
                while (tuIterator.isNextSection());
5464
0
            }
5465
0
        }
5466
5467
        // Here we were encoding cbfs and coefficients, after calculating distortion above.
5468
        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
5469
        // bits required for coefficients and added them to the number of cbf bits. As I tested, the order does not
5470
        // make any difference. But I am a bit confused whether I should load the original context as below.
5471
0
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
5472
0
        m_entropyCoder.resetBits();
5473
5474
        //Encode cbf flags
5475
0
        if (codeChroma)
5476
0
        {
5477
0
            if (!splitIntoSubTUs)
5478
0
            {
5479
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5480
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5481
0
            }
5482
0
            else
5483
0
            {
5484
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5485
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5486
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5487
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
5488
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5489
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
5490
0
            }
5491
0
        }
5492
5493
0
        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
5494
5495
0
        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
5496
5497
0
        uint32_t coeffBits = 0;
5498
0
        coeffBits = singleBits[TEXT_LUMA][0];
5499
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5500
0
        {
5501
0
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
5502
0
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
5503
0
        }
5504
5505
        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
5506
        // In the case of chroma, if any one of the split blocks' cbfs is 1, then we need to encode cbf 1, and then
5507
        // each of the four split blocks' individual cbf values. This is not known before analysis of the four split blocks.
5508
        // For that reason, I am collecting individual coefficient bits only.
5509
0
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
5510
5511
0
        fullCost.distortion += singleDist[TEXT_LUMA][0];
5512
0
        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
5513
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5514
0
        {
5515
0
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
5516
0
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
5517
0
        }
5518
5519
0
        if (m_rdCost.m_psyRd)
5520
0
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5521
0
        else if(m_rdCost.m_ssimRd)
5522
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5523
0
        else
5524
0
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
5525
5526
0
        if (m_param->limitTU && bCheckSplit)
5527
0
        {
5528
            // Stop recursion if the TU's energy level is minimal
5529
0
            uint32_t numCoeff = trSize * trSize;
5530
0
            if (cbfFlag[TEXT_LUMA][0] == 0)
5531
0
                bCheckSplit = false;
5532
0
            else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
5533
0
            {
5534
0
                uint32_t energy = 0;
5535
0
                for (uint32_t i = 0; i < numCoeff; i++)
5536
0
                    energy += abs(coeffCurY[i]);
5537
0
                if (energy == numSig[TEXT_LUMA][0])
5538
0
                    bCheckSplit = false;
5539
0
            }
5540
0
        }
5541
5542
0
        if (bSaveTUData)
5543
0
        {
5544
0
            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5545
0
            {
5546
0
                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5547
0
                {
5548
0
                    m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
5549
0
                    m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
5550
0
                }
5551
0
            }
5552
0
            m_cacheTU.cost[idx] = fullCost;
5553
0
            m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
5554
0
        }
5555
0
    }
5556
0
    if (bLoadTUData)
5557
0
    {
5558
0
        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5559
0
        {
5560
0
            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5561
0
            {
5562
0
                bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
5563
0
                cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
5564
0
            }
5565
0
        }
5566
0
        fullCost = m_cacheTU.cost[idx];
5567
0
        m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
5568
0
        bCheckFull = true;
5569
0
    }
5570
5571
    // code sub-blocks
5572
0
    if (bCheckSplit)
5573
0
    {
5574
0
        if (bCheckFull)
5575
0
        {
5576
0
            m_entropyCoder.store(m_rqt[depth].rqtTest);
5577
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5578
0
        }
5579
5580
0
        Cost splitCost;
5581
0
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
5582
0
        {
5583
            // Subdiv flag can be encoded at the start of analysis of split blocks.
5584
0
            m_entropyCoder.resetBits();
5585
0
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
5586
0
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
5587
0
        }
5588
5589
0
        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
5590
0
        if (yCbCrCbf || !bCheckFull)
5591
0
        {
5592
0
            if (splitCost.rdcost < fullCost.rdcost)
5593
0
            {
5594
0
                if (m_limitTU & X265_TU_LIMIT_BFS)
5595
0
                {
5596
0
                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
5597
0
                    bool nextSplit = nextlog2TrSize > depthRange[0];
5598
0
                    if (nextSplit)
5599
0
                    {
5600
0
                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
5601
0
                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
5602
0
                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
5603
0
                        {
5604
                            // Subdiv flag can be encoded at the start of analysis of split blocks.
5605
0
                            m_entropyCoder.resetBits();
5606
0
                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
5607
0
                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
5608
0
                        }
5609
0
                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
5610
0
                    }
5611
0
                }
5612
0
                outCosts.distortion += splitCost.distortion;
5613
0
                outCosts.rdcost     += splitCost.rdcost;
5614
0
                outCosts.bits       += splitCost.bits;
5615
0
                outCosts.energy     += splitCost.energy;
5616
0
                return;
5617
0
            }
5618
0
            else
5619
0
                outCosts.energy     += splitCost.energy;
5620
0
        }
5621
5622
0
        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
5623
0
        if (codeChroma)
5624
0
        {
5625
0
            if (!splitIntoSubTUs)
5626
0
            {
5627
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
5628
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
5629
0
            }
5630
0
            else
5631
0
            {
5632
0
                uint32_t tuNumParts = absPartIdxStep >> 1;
5633
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
5634
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
5635
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
5636
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
5637
0
            }
5638
0
        }
5639
0
        X265_CHECK(bCheckFull, "check-full must be set\n");
5640
0
        m_entropyCoder.load(m_rqt[depth].rqtTest);
5641
0
    }
5642
5643
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5644
0
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5645
5646
0
    if (codeChroma)
5647
0
    {
5648
0
        if (!splitIntoSubTUs)
5649
0
        {
5650
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
5651
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
5652
0
        }
5653
0
        else
5654
0
        {
5655
0
            uint32_t tuNumParts = absPartIdxStep >> 1;
5656
5657
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
5658
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
5659
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
5660
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
5661
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
5662
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
5663
0
        }
5664
0
    }
5665
5666
0
    outCosts.distortion += fullCost.distortion;
5667
0
    outCosts.rdcost     += fullCost.rdcost;
5668
0
    outCosts.bits       += fullCost.bits;
5669
0
    outCosts.energy     += fullCost.energy;
5670
0
}
5671
5672
/* Walk the transform quadtree of an inter CU and hand the coded-block flags
 * (chroma first, then luma at the leaves) to the entropy coder.
 *
 *   cu          CU whose m_tuDepth / CBF arrays describe the chosen TU tree
 *   absPartIdx  partition index of the TU node being visited
 *   tuDepth     depth of this node relative to the CU
 *   depthRange  not inspected here; only forwarded to the recursive calls */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool isLeaf = !(tuDepth < cu.m_tuDepth[absPartIdx]);
    const bool hasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    /* chroma flags are only coded while the log2 chroma transform size is at least 2 */
    if (hasChroma && !(log2TrSize - m_hChromaShift < 2))
    {
        /* partition index of the enclosing (parent) TU node */
        const uint32_t parentIdx = absPartIdx & (0xFF << ((log2TrSize + 1 - LOG2_UNIT_SIZE) * 2));

        /* at the root there is no parent; below it, a plane's flag is coded
         * only when the parent's flag for that plane is set */
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, isLeaf);

        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, isLeaf);
    }

    if (isLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
        return;
    }

    /* interior node: descend into the four child TUs in order */
    const uint32_t childParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    for (uint32_t child = 0; child < 4; ++child)
        codeInterSubdivCbfQT(cu, absPartIdx + child * childParts, tuDepth + 1, depthRange);
}
5701
5702
/* Copy the residual samples and coefficients of every leaf TU from the
 * per-layer m_rqt scratch storage into resiYuv and cu.m_trCoeff.
 *
 *   cu          CU whose m_tuDepth selects the leaves and whose m_trCoeff is written
 *   resiYuv     destination residual buffer
 *   absPartIdx  partition index of the TU node being visited
 *   tuDepth     depth of this node relative to the CU */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    /* not yet at the chosen TU depth: visit the four children and stop */
    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        const uint32_t childParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t child = 0; child < 4; ++child)
            saveResidualQTData(cu, resiYuv, absPartIdx + child * childParts, tuDepth + 1);
        return;
    }

    /* m_rqt is indexed by log2 transform size minus 2 */
    const uint32_t qtLayer = log2TrSize - 2;

    bool bChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        /* chroma size is clamped to log2 size 2 and is handled only on the
         * partition whose index is a multiple of four */
        log2TrSizeC = 2;
        bChroma = bChroma && !(absPartIdx & 3);
    }

    /* luma: residual samples, then coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const uint32_t numCoeffY = 1 << (log2TrSize * 2);
    memcpy(cu.m_trCoeff[0] + coeffOffsetY,
           m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY,
           sizeof(coeff_t) * numCoeffY);

    if (!bChroma)
        return;

    /* chroma: residual samples, then the coefficients of planes 1 and 2 */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

    const uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
    const uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    for (int plane = 1; plane <= 2; ++plane)
    {
        coeff_t* dst = cu.m_trCoeff[plane] + coeffOffsetC;
        coeff_t* src = m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC;
        memcpy(dst, src, sizeof(coeff_t) * numCoeffC);
    }
}
5748
5749
/* returns the number of bits required to signal a non-most-probable mode.
5750
 * on return mpms contains bitmap of most probable modes */
5751
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
5752
0
{
5753
0
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
5754
5755
0
    mpms = 0;
5756
0
    for (int i = 0; i < 3; ++i)
5757
0
        mpms |= ((uint64_t)1 << mpmModes[i]);
5758
5759
0
    return m_entropyCoder.bitsIntraModeNonMPM();
5760
0
}
5761
5762
/* swap the current mode/cost with the mode with the highest cost in the
5763
 * current candidate list, if its cost is better (maintain a top N list) */
5764
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
5765
0
{
5766
0
    uint32_t maxIndex = 0;
5767
0
    uint64_t maxValue = 0;
5768
5769
0
    for (int i = 0; i < maxCandCount; i++)
5770
0
    {
5771
0
        if (maxValue < candCostList[i])
5772
0
        {
5773
0
            maxValue = candCostList[i];
5774
0
            maxIndex = i;
5775
0
        }
5776
0
    }
5777
5778
0
    if (cost < maxValue)
5779
0
    {
5780
0
        candCostList[maxIndex] = cost;
5781
0
        candModeList[maxIndex] = mode;
5782
0
    }
5783
0
}
5784
5785
/* Account for delta-QP signalling on a finished mode.
 *
 * Does nothing unless the PPS enables DQP and the CU depth is within
 * maxCuDQPDepth.  When the CU has a non-zero root CBF, the delta-QP bits are
 * added to the mode's bit count and its cost is refreshed; otherwise the
 * CU's QP is reset to the reference QP. */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!cu.m_slice->m_pps->bUseDQP || !(cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth))
        return;

    if (!cu.getQtRootCbf(0))
    {
        /* nothing coded in this CU: fall back to the reference QP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* sa8d-based costing: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel >= 3)
    {
        /* measure the delta-QP syntax with the mode's own entropy contexts */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else
    {
        /* remaining RD level: charge a single bit against the total */
        mode.totalBits++;
        updateModeCost(mode);
    }
}
5815
5816
/* Delta-QP accounting for a split prediction mode.
 *
 * Acts only when the CU depth equals the PPS maxCuDQPDepth and DQP is
 * enabled.  If any partition of the CU has a non-zero root CBF, the delta-QP
 * bits are added to the mode and its cost is refreshed; otherwise the QP of
 * the whole CU is reset to the reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!(cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) || !cu.m_slice->m_pps->bUseDQP)
        return;

    /* scan for the first partition that carries residual (non-zero root CBF) */
    uint32_t blkIdx = 0;
    while (blkIdx < cuGeom.numPartitions && !cu.getQtRootCbf(blkIdx))
        ++blkIdx;
    const bool bAnyResidual = blkIdx < cuGeom.numPartitions;

    if (!bAnyResidual)
    {
        /* No residual within this CU or subCU, so reset QP to RefQP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* sa8d-based costing: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel >= 3)
    {
        /* measure the delta-QP syntax with the mode's own entropy contexts */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else
    {
        /* remaining RD level: charge a single bit against the total */
        mode.totalBits++;
        updateModeCost(mode);
    }

    /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not
     * signalled). When the non-zero CBF sub-CU is found, stop */
    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}