Coverage Report

Created: 2022-08-24 06:15

/src/x265/source/encoder/search.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Steve Borho <steve@borho.org>
5
*          Min Chen <chenm003@163.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 2 of the License, or
10
* (at your option) any later version.
11
*
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
* GNU General Public License for more details.
16
*
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
*
21
* This program is also available under a commercial proprietary license.
22
* For more information, contact us at license @ x265.com.
23
*****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "picyuv.h"
28
#include "cudata.h"
29
30
#include "search.h"
31
#include "entropy.h"
32
#include "rdcost.h"
33
34
#include "analysis.h"  // TLD
35
#include "framedata.h"
36
37
using namespace X265_NS;
38
39
#if _MSC_VER
40
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
41
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
42
#pragma warning(disable: 4127) // conditional expression is constant
43
#endif
44
45
0
/* NOTE(review): presumably the bit count charged for signalling an MVP index;
 * no use of this macro is visible in this part of the file -- confirm at the use site */
#define MVP_IDX_BITS 1
46
47
/* Definition of the static table Search::zeroShort: MAX_CU_SIZE int16_t entries,
 * all zero.  ALIGN_VAR_32 presumably requests 32-byte alignment -- confirm in common.h */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
48
49
/* Put a freshly constructed Search into an "empty" state: every buffer pointer
 * is NULL and the layer count is zero.  Buffers are allocated by initSearch(). */
Search::Search()
{
    /* wipe the whole per-layer RQT array before anything else touches it */
    memset(m_rqt, 0, sizeof(m_rqt));

    /* three-entry temporary flag tables start out unallocated */
    for (int plane = 0; plane < 3; plane++)
    {
        m_qtTempCbf[plane] = NULL;
        m_qtTempTransformSkipFlag[plane] = NULL;
    }

    /* context pointers: nothing attached yet */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    /* intra prediction scratch buffers */
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;

    /* transform-skip scratch buffers */
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;

    m_numLayers = 0;
    m_maxTUDepth = -1;
}
72
73
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
74
22.6k
{
75
22.6k
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
76
22.6k
    m_param = &param;
77
22.6k
    m_bFrameParallel = param.frameNumThreads > 1;
78
22.6k
    m_numLayers = g_log2Size[param.maxCUSize] - 2;
79
80
22.6k
    m_rdCost.setPsyRdScale(param.psyRd);
81
22.6k
    m_rdCost.setSsimRd(param.bSsimRd);
82
22.6k
    m_me.init(param.internalCsp);
83
84
22.6k
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
85
22.6k
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
86
0
        ok &= m_quant.allocNoiseReduction(param);
87
88
22.6k
    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
89
90
    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
91
     * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
92
22.6k
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
93
94
22.6k
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
95
22.6k
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
96
22.6k
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
97
98
22.6k
    m_limitTU = 0;
99
22.6k
    if (m_param->limitTU)
100
0
    {
101
0
        if (m_param->limitTU == 1)
102
0
            m_limitTU = X265_TU_LIMIT_BFS;
103
0
        else if (m_param->limitTU == 2)
104
0
            m_limitTU = X265_TU_LIMIT_DFS;
105
0
        else if (m_param->limitTU == 3)
106
0
            m_limitTU = X265_TU_LIMIT_NEIGH;
107
0
        else if (m_param->limitTU == 4)
108
0
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
109
0
    }
110
111
    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
112
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
113
     * which are reconstructed at each depth are valid. At the end, the transform depth table
114
     * is walked and the coeff and recon at the correct depths are collected */
115
116
22.6k
    if (param.internalCsp != X265_CSP_I400)
117
22.6k
    {
118
124k
        for (uint32_t i = 0; i <= m_numLayers; i++)
119
101k
        {
120
101k
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
121
101k
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
122
101k
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
123
101k
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
124
101k
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
125
101k
        }
126
22.6k
    }
127
0
    else
128
0
    {
129
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
130
0
        {
131
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
132
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
133
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
134
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
135
0
        }
136
0
    }
137
138
    /* the rest of these buffers are indexed per-depth */
139
101k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
140
78.8k
    {
141
78.8k
        int cuSize = param.maxCUSize >> i;
142
78.8k
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
143
78.8k
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
144
78.8k
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
145
78.8k
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
146
78.8k
    }
147
148
22.6k
    if (param.internalCsp != X265_CSP_I400)
149
22.6k
    {
150
22.6k
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
151
22.6k
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
152
22.6k
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
153
22.6k
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
154
22.6k
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
155
22.6k
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
156
22.6k
    }
157
0
    else
158
0
    {
159
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
160
0
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
161
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
162
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
163
0
    }
164
165
22.6k
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
166
22.6k
    m_fencScaled = m_intraPred + 32 * 32;
167
22.6k
    m_fencTransposed = m_fencScaled + 32 * 32;
168
22.6k
    m_intraPredAngs = m_fencTransposed + 32 * 32;
169
170
22.6k
    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
171
22.6k
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
172
22.6k
    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
173
174
22.6k
    return ok;
175
176
0
fail:
177
0
    return false;
178
22.6k
}
179
180
/* Release every buffer owned by this Search instance.
 *
 * The constructor leaves m_param NULL and only initSearch() assigns it, so the
 * per-depth cleanup (whose loop bound is read through m_param) is skipped for
 * an object that was never initialized instead of dereferencing a null pointer. */
Search::~Search()
{
    /* buffers indexed by qtLayer; m_numLayers is 0 until initSearch() runs */
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    /* buffers indexed per CU depth; these were only created if initSearch() ran */
    if (m_param)
    {
        for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
        {
            m_rqt[i].tmpResiYuv.destroy();
            m_rqt[i].tmpPredYuv.destroy();
            m_rqt[i].bidirPredYuv[0].destroy();
            m_rqt[i].bidirPredYuv[1].destroy();
        }
    }

    /* only element [0] of each flag table is an allocation; [1] and [2] point into it */
    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
    /* m_fencScaled, m_fencTransposed and m_intraPredAngs point into m_intraPred */
    X265_FREE(m_intraPred);
    X265_FREE(m_tsCoeff);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsRecon);
}
204
205
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
206
28.1k
{
207
28.1k
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
208
209
28.1k
    m_me.setQP(qp);
210
28.1k
    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
211
212
28.1k
    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
213
28.1k
    m_quant.setQPforQuant(ctu, quantQP);
214
28.1k
    return quantQP;
215
28.1k
}
216
217
#if CHECKED_BUILD || _DEBUG
218
void Search::invalidateContexts(int fromDepth)
219
{
220
    /* catch reads without previous writes */
221
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
222
    {
223
        m_rqt[d].cur.markInvalid();
224
        m_rqt[d].rqtTemp.markInvalid();
225
        m_rqt[d].rqtRoot.markInvalid();
226
        m_rqt[d].rqtTest.markInvalid();
227
    }
228
}
229
#else
230
115k
void Search::invalidateContexts(int) {}
231
#endif
232
233
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
234
10.6M
{
235
10.6M
    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
236
10.6M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
237
238
10.6M
    if (!(log2TrSize - m_hChromaShift < 2))
239
3.88M
    {
240
3.88M
        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
241
3.88M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
242
3.88M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
243
3.88M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
244
3.87M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
245
3.88M
    }
246
247
10.6M
    if (subdiv)
248
1.69M
    {
249
1.69M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
250
8.46M
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
251
6.77M
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
252
1.69M
    }
253
10.6M
}
254
255
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
256
7.79M
{
257
7.79M
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
258
7.72M
        return;
259
260
67.6k
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
261
262
67.6k
    if (tuDepth < cu.m_tuDepth[absPartIdx])
263
12.7k
    {
264
12.7k
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
265
63.5k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
266
50.8k
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
267
268
12.7k
        return;
269
12.7k
    }
270
271
54.8k
    uint32_t tuDepthC = tuDepth;
272
54.8k
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
273
274
54.8k
    if (log2TrSizeC < 2)
275
36.0k
    {
276
36.0k
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
277
36.0k
        if (absPartIdx & 3)
278
27.0k
            return;
279
9.00k
        log2TrSizeC = 2;
280
9.00k
        tuDepthC--;
281
9.00k
    }
282
283
27.8k
    uint32_t qtLayer = log2TrSize - 2;
284
285
27.8k
    if (m_csp != X265_CSP_I422)
286
27.3k
    {
287
27.3k
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
288
27.3k
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
289
27.3k
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
290
27.3k
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
291
27.3k
    }
292
565
    else
293
565
    {
294
565
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
295
565
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
296
565
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
297
565
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
298
565
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
299
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
300
565
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
301
0
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
302
565
    }
303
27.8k
}
304
305
/* Recursive RD search of the intra luma transform tree for one TU.
 *
 * mode        - mode under evaluation; its cu, predYuv and fencYuv are used
 * cuGeom      - geometry of the CU that owns this TU
 * tuDepth     - depth of this TU below the CU root
 * absPartIdx  - partition index of this TU within the CU
 * bAllowSplit - whether a split may be evaluated in addition to the full-size TU
 * outCost     - accumulator; the cost of the winning choice is ADDED to it
 * depthRange  - [0] is the log2 TU size at or below which no further split is
 *               tried, [1] is the largest log2 TU size that may be coded unsplit
 *
 * When both choices are possible the full-size TU is coded first, then the four
 * sub-TUs, and the cheaper RD cost wins.  If the full-size TU wins (or was the
 * only choice) its reconstruction is copied into the frame's recon picture so
 * that following intra blocks can predict from it. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t qtLayer    = log2TrSize - 2;
    uint32_t sizeIdx    = log2TrSize - 2;
    bool mightNotSplit  = log2TrSize <= depthRange[1];
    bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    bool bEnableRDOQ    = !!m_param->rdoqLevel;

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
    {
        /* remember the entropy state so the split path can restart from it */
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride   = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // store original entropy coding status
        if (bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);

            /* Pick the aligned add_ps variant only when every buffer and stride is
             * 64-aligned.  Each offset is computed with the row size of the buffer
             * it belongs to, matching what getLumaAddr() used for that buffer. */
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
        }
        else
            // no coded residual, recon = pred
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);

        bCBF = !!numSig << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);

        /* count the bits of the CU header (first partition only), intra
         * direction(s), subdiv flag, CBF and coefficients */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        /* RD penalty: make 32x32 intra TUs in non-I slices four times as expensive in bits */
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }
    else
        /* full-size TU not permitted: any split cost will compare lower */
        fullCost.rdcost = MAX_INT64;

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
        }

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        /* fold the children's CBF into this depth's bit of the first partition */
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else if (m_rdCost.m_ssimRd)
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            /* split wins: the recursive calls already wrote recon and CU state */
            outCost.rdcost     += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits       += splitCost.bits;
            outCost.energy     += splitCost.energy;
            return;
        }
        else
        {
            // recover entropy state of full-size TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    PicYuv*  reconPic = m_frame->m_reconPic;
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost     += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits       += fullCost.bits;
    outCost.energy     += fullCost.energy;
}
509
510
/* Code one intra luma TU twice -- first with the regular transform, then with
 * transform skip -- and keep whichever has the lower RD cost.
 *
 * mode       - mode under evaluation; its cu, predYuv and fencYuv are used
 * cuGeom     - geometry of the CU that owns this TU
 * tuDepth    - depth of this TU below the CU root
 * absPartIdx - partition index of this TU within the CU
 * outCost    - accumulator; the cost of the winning pass is ADDED to it
 *
 * The transform-skip pass works in the m_ts* scratch buffers and is abandoned
 * if it yields no significant coefficients.  The winner's coefficients and
 * reconstruction end up in the m_rqt layer buffers, and the reconstruction is
 * also copied into the frame's recon picture. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    const uint32_t fullDepth  = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t tuSize     = 1 << log2TrSize;
    const uint32_t sizeIdx    = log2TrSize - 2;
    const uint32_t qtLayer    = log2TrSize - 2;
    const bool     useRdoq    = !!m_param->rdoqLevel;

    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;

    /* best result so far; starts at "infinitely expensive" so the first pass always wins */
    Cost bestCost;
    bestCost.rdcost = MAX_INT64;
    int      bestUsesTSkip = 0;
    uint32_t bestCbf = 0;

    const pixel* fenc     = mode.fencYuv->getLumaAddr(absPartIdx);
    pixel*       pred     = mode.predYuv.getLumaAddr(absPartIdx);
    int16_t*     residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    const uint32_t stride = mode.fencYuv->m_size;

    // set up neighbour availability for intra prediction
    const uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

    // generate the prediction
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    coeff_t* const layerCoeff  = m_rqt[qtLayer].coeffRQT[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));
    pixel* const   layerRecon  = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    const uint32_t layerStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // snapshot the entropy coder so the second pass can start from the same state
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (useRdoq)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t candCost;
        uint32_t candEnergy = 0;

        /* the transform-skip pass writes to scratch buffers so the regular result survives */
        coeff_t* coeff;
        pixel*   candRecon;
        uint32_t candReconStride;
        if (useTSkip)
        {
            coeff = m_tsCoeff;
            candRecon = m_tsRecon;
            candReconStride = MAX_TS_SIZE;
        }
        else
        {
            coeff = layerCoeff;
            candRecon = layerRecon;
            candReconStride = layerStride;
        }
        const bool candReconAlign = useTSkip || (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0);

        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);

            /* the aligned add_ps variant needs every buffer and stride 64-aligned */
            const bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            const bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            const bool allAligned = (stride % 64 == 0) && (candReconStride % 64 == 0) && candReconAlign && residualAlign && predAlign;
            primitives.cu[sizeIdx].add_ps[allAligned](candRecon, candReconStride, pred, residual, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
        {
            // nothing coded, so the reconstruction is simply the prediction
            primitives.cu[sizeIdx].copy_pp(candRecon, candReconStride, pred, stride);
        }

        const sse_t candDist = primitives.cu[sizeIdx].sse_pp(candRecon, candReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        /* second pass: rewind the entropy coder to the snapshot taken before the loop */
        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        /* count the bits of the CU header (first partition only), intra
         * direction(s), subdiv flag, CBF and coefficients */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }

        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            const uint32_t quarterParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
                    m_entropyCoder.codeIntraDirLumaAng(cu, quadrant * quarterParts, false);
            }
            else if (!(absPartIdx & (quarterParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        const uint32_t candBits = m_entropyCoder.getNumberOfWrittenBits();

        /* keep the entropy state of the regular pass in case it ends up winning */
        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            candEnergy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, candRecon, candReconStride);
            candCost = m_rdCost.calcPsyRdCost(candDist, candBits, candEnergy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            candEnergy = m_quant.ssimDistortion(cu, fenc, stride, candRecon, candReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
            candCost = m_rdCost.calcSsimRdCost(candDist, candBits, candEnergy);
        }
        else
            candCost = m_rdCost.calcRdCost(candDist, candBits);

        if (candCost < bestCost.rdcost)
        {
            bestUsesTSkip = useTSkip;
            bestCbf = !!numSig;
            bestCost.rdcost = candCost;
            bestCost.distortion = candDist;
            bestCost.bits = candBits;
            bestCost.energy = candEnergy;
        }
    }

    if (bestUsesTSkip)
    {
        /* transform skip won: move its scratch results into the layer buffers */
        memcpy(layerCoeff, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.cu[sizeIdx].copy_pp(layerRecon, layerStride, m_tsRecon, tuSize);
    }
    else if (checkTransformSkip)
    {
        /* transform skip was fully evaluated but lost: undo the CU flags and
         * entropy state that its pass left behind */
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bestCbf << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // publish the reconstruction so later intra blocks can predict from it
    PicYuv*  reconPic  = m_frame->m_reconPic;
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, layerRecon, layerStride);

    outCost.rdcost     += bestCost.rdcost;
    outCost.distortion += bestCost.distortion;
    outCost.bits       += bestCost.bits;
    outCost.energy     += bestCost.energy;
}
685
686
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
687
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
688
0
{
689
0
    CUData& cu = mode.cu;
690
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
691
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
692
0
    bool     bCheckFull = log2TrSize <= depthRange[1];
693
694
0
    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
695
696
    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
697
     * since we are not measuring RD cost */
698
0
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
699
0
        bCheckFull = false;
700
701
0
    if (bCheckFull)
702
0
    {
703
0
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
704
0
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
705
0
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
706
0
        uint32_t stride   = mode.fencYuv->m_size;
707
708
        // init availability pattern
709
0
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
710
0
        IntraNeighbors intraNeighbors;
711
0
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
712
0
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
713
714
        // get prediction signal
715
0
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
716
717
0
        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
718
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
719
720
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
721
0
        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
722
723
0
        uint32_t sizeIdx   = log2TrSize - 2;
724
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
725
726
0
        PicYuv*  reconPic = m_frame->m_reconPic;
727
0
        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
728
0
        intptr_t picStride = reconPic->m_stride;
729
730
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
731
0
        if (numSig)
732
0
        {
733
0
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
734
0
            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
735
0
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
736
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
737
0
            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
738
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
739
0
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
740
0
        }
741
0
        else
742
0
        {
743
0
            primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
744
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
745
0
        }
746
0
    }
747
0
    else
748
0
    {
749
0
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
750
751
        /* code split block */
752
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
753
0
        uint32_t cbf = 0;
754
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
755
0
        {
756
0
            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
757
0
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
758
0
        }
759
0
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
760
0
    }
761
0
}
762
763
/* Copies the luma intra result (transform coefficients and reconstruction) of the TU tree rooted
 * at absPartIdx/tuDepth out of the per-layer m_rqt scratch buffers into cu.m_trCoeff[0] and
 * reconYuv. Recurses into the four quadrants until tuDepth matches the depth recorded in
 * cu.m_tuDepth. */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        // not the coded depth yet: visit each quadrant one level down
        const uint32_t partsPerQuadrant = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx + quadrant * partsPerQuadrant);
        return;
    }

    // the scratch layer is selected by the TU size
    const uint32_t layer = log2TrSize - 2;

    // copy transform coefficients
    const uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const coeff_t* src = m_rqt[layer].coeffRQT[0] + coeffOffset;
    coeff_t*       dst = cu.m_trCoeff[0] + coeffOffset;
    memcpy(dst, src, sizeof(coeff_t) << (log2TrSize * 2));

    // copy reconstruction
    m_rqt[layer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
787
788
/* Shifts both sub-TU CBF values up by one bit and sets bit 0 of each to the OR of the two
 * original values. The result is truncated to 8 bits when stored back. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t eitherCBF = subTUCBF[0] | subTUCBF[1];
    for (int half = 0; half < 2; half++)
        subTUCBF[half] = (subTUCBF[half] << 1) | eitherCBF;
}
794
795
/* 4:2:2 post-TU split processing */
796
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
797
0
{
798
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
799
800
0
    if (log2TrSize == 2)
801
0
    {
802
0
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
803
0
        ++log2TrSize;
804
0
    }
805
806
0
    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
807
808
    // move the CBFs down a level and set the parent CBF
809
0
    uint8_t subTUCBF[2];
810
0
    subTUCBF[0] = cu.getCbf(absPartIdx            , ttype, tuDepth);
811
0
    subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
812
0
    offsetCBFs(subTUCBF);
813
814
0
    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx             , tuNumParts);
815
0
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
816
0
}
817
818
/* returns distortion */
819
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
820
10.5M
{
821
10.5M
    CUData& cu = mode.cu;
822
10.5M
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
823
10.5M
    bool bEnableRDOQ = !!m_param->rdoqLevel;
824
825
10.5M
    if (tuDepth < cu.m_tuDepth[absPartIdx])
826
1.69M
    {
827
1.69M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
828
1.69M
        uint32_t splitCbfU = 0, splitCbfV = 0;
829
8.45M
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
830
6.76M
        {
831
6.76M
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
832
6.76M
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
833
6.76M
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
834
6.76M
        }
835
1.69M
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
836
1.69M
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
837
838
1.69M
        return;
839
1.69M
    }
840
841
8.90M
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
842
8.90M
    uint32_t tuDepthC = tuDepth;
843
8.90M
    if (log2TrSizeC < 2)
844
6.75M
    {
845
6.75M
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
846
6.75M
        if (absPartIdx & 3)
847
5.07M
            return;
848
1.67M
        log2TrSizeC = 2;
849
1.67M
        tuDepthC--;
850
1.67M
    }
851
852
3.82M
    if (bEnableRDOQ)
853
3.87M
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
854
855
3.82M
    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
856
3.82M
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
857
3.82M
    if (checkTransformSkip)
858
0
    {
859
0
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
860
0
        return;
861
0
    }
862
863
3.82M
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
864
3.82M
    uint32_t qtLayer = log2TrSize - 2;
865
3.82M
    uint32_t stride = mode.fencYuv->m_csize;
866
3.82M
    const uint32_t sizeIdxC = log2TrSizeC - 2;
867
868
3.82M
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
869
3.82M
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
870
871
3.82M
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
872
3.82M
    do
873
3.82M
    {
874
3.82M
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
875
876
3.82M
        IntraNeighbors intraNeighbors;
877
3.82M
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
878
879
11.5M
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
880
7.73M
        {
881
7.73M
            TextType ttype = (TextType)chromaId;
882
883
7.73M
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
884
7.73M
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
885
7.73M
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
886
7.73M
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
887
7.73M
            coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
888
7.73M
            pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
889
7.73M
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
890
7.73M
            PicYuv*  reconPic = m_frame->m_reconPic;
891
7.73M
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
892
7.73M
            intptr_t picStride = reconPic->m_strideC;
893
894
7.73M
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
895
7.73M
            if (chromaPredMode == DM_CHROMA_IDX)
896
1.55M
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
897
7.73M
            if (m_csp == X265_CSP_I422)
898
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
899
900
            // init availability pattern
901
7.73M
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
902
903
            // get prediction signal
904
7.73M
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
905
7.73M
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
906
907
7.73M
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
908
909
7.73M
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
910
7.73M
            if (numSig)
911
27.3k
            {
912
27.3k
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
913
27.3k
                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
914
27.3k
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
915
27.3k
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
916
27.3k
                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
917
27.3k
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
918
27.3k
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
919
27.3k
            }
920
7.70M
            else
921
7.70M
            {
922
                // no coded residual, recon = pred
923
7.70M
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
924
7.70M
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
925
7.70M
            }
926
927
7.73M
            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
928
929
7.73M
            if (m_rdCost.m_psyRd)
930
7.73M
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
931
18.4E
            else if(m_rdCost.m_ssimRd)
932
0
                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
933
934
7.73M
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
935
7.73M
        }
936
3.82M
    }
937
3.82M
    while (tuIterator.isNextSection());
938
939
3.82M
    if (splitType == VERTICAL_SPLIT)
940
0
    {
941
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
942
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
943
0
    }
944
3.82M
}
945
946
/* returns distortion */
947
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
948
0
{
949
0
    CUData& cu = mode.cu;
950
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
951
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
952
0
    const uint32_t log2TrSizeC = 2;
953
0
    uint32_t qtLayer = log2TrSize - 2;
954
955
    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
956
     * so the entropy coder is not very accurate. The best we can do is return it in the same
957
     * condition as it arrived, and to do all bit estimates from the same state. */
958
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
959
960
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
961
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
962
963
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
964
0
    do
965
0
    {
966
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
967
968
0
        IntraNeighbors intraNeighbors;
969
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
970
971
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
972
0
        {
973
0
            TextType ttype = (TextType)chromaId;
974
975
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
976
0
            pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
977
0
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
978
0
            uint32_t stride = mode.fencYuv->m_csize;
979
0
            const uint32_t sizeIdxC = log2TrSizeC - 2;
980
981
0
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
982
0
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
983
0
            pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
984
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
985
986
            // init availability pattern
987
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
988
989
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
990
0
            if (chromaPredMode == DM_CHROMA_IDX)
991
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
992
0
            if (m_csp == X265_CSP_I422)
993
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
994
995
            // get prediction signal
996
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
997
998
0
            uint64_t bCost = MAX_INT64;
999
0
            sse_t bDist = 0;
1000
0
            uint32_t bCbf = 0;
1001
0
            uint32_t bEnergy = 0;
1002
0
            int      bTSkip = 0;
1003
1004
0
            int checkTransformSkip = 1;
1005
0
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
1006
0
            {
1007
0
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
1008
0
                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
1009
0
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
1010
1011
0
                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1012
1013
0
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
1014
0
                if (numSig)
1015
0
                {
1016
0
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
1017
0
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
1018
0
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1019
0
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1020
0
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
1021
0
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
1022
0
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1023
0
                }
1024
0
                else if (useTSkip)
1025
0
                {
1026
0
                    checkTransformSkip = 0;
1027
0
                    break;
1028
0
                }
1029
0
                else
1030
0
                {
1031
0
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
1032
0
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1033
0
                }
1034
0
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
1035
0
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
1036
1037
0
                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1038
1039
0
                uint32_t tmpBits = 0, tmpEnergy = 0;
1040
0
                if (numSig)
1041
0
                {
1042
0
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1043
0
                    m_entropyCoder.resetBits();
1044
0
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1045
0
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
1046
0
                }
1047
1048
0
                uint64_t tmpCost;
1049
0
                if (m_rdCost.m_psyRd)
1050
0
                {
1051
0
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1052
0
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
1053
0
                }
1054
0
                else if(m_rdCost.m_ssimRd)
1055
0
                {
1056
0
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1057
0
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
1058
0
                }
1059
0
                else
1060
0
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
1061
1062
0
                if (tmpCost < bCost)
1063
0
                {
1064
0
                    bCost = tmpCost;
1065
0
                    bDist = tmpDist;
1066
0
                    bTSkip = useTSkip;
1067
0
                    bCbf = !!numSig;
1068
0
                    bEnergy = tmpEnergy;
1069
0
                }
1070
0
            }
1071
1072
0
            if (bTSkip)
1073
0
            {
1074
0
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
1075
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
1076
0
            }
1077
1078
0
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1079
0
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1080
1081
0
            PicYuv*  reconPic = m_frame->m_reconPic;
1082
0
            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1083
0
            intptr_t picStride = reconPic->m_strideC;
1084
0
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
1085
1086
0
            outCost.distortion += bDist;
1087
0
            outCost.energy += bEnergy;
1088
0
        }
1089
0
    }
1090
0
    while (tuIterator.isNextSection());
1091
1092
0
    if (splitType == VERTICAL_SPLIT)
1093
0
    {
1094
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1095
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1096
0
    }
1097
1098
0
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1099
0
}
1100
1101
/* Copies the chroma intra result (U and V transform coefficients and reconstruction) of the TU
 * tree rooted at absPartIdx/tuDepth out of the per-layer m_rqt scratch buffers into
 * cu.m_trCoeff[1], cu.m_trCoeff[2] and reconYuv. Recursion stops when tuDepth reaches the depth
 * recorded in cu.m_tuDepth or when the chroma block is already 4x4 (log2 size 2). */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t lumaTuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize  = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (lumaTuDepth != tuDepth && log2TrSizeC != 2)
    {
        // not at the coded depth yet: visit each quadrant one level down
        const uint32_t partsPerQuadrant = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx + quadrant * partsPerQuadrant, tuDepth + 1);
        return;
    }

    // the layer index is lowered by however many levels the luma tree continues below this depth
    const uint32_t qtLayer = log2TrSize - 2 - (lumaTuDepth - tuDepth);

    // copy transform coefficients (the count is doubled for 4:2:2)
    const uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
    const uint32_t numCoeffC    = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    for (uint32_t plane = 1; plane <= 2; plane++)
    {
        const coeff_t* src = m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC;
        coeff_t*       dst = cu.m_trCoeff[plane] + coeffOffsetC;
        memcpy(dst, src, sizeof(coeff_t) * numCoeffC);
    }

    // copy reconstruction
    m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
}
1131
1132
/* Generates the intra chroma (U and V) residual for the TU tree rooted at absPartIdx/tuDepth
 * without measuring any cost. Coefficients are written straight into cu.m_trCoeff and the
 * reconstruction straight into the reconstructed picture; the CU's chroma CBFs are updated. */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;

    // the luma TU tree goes deeper here: recurse into the four quadrants and merge their CBFs
    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        const uint32_t partsPerQuadrant = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t mergedCbfU = 0;
        uint32_t mergedCbfV = 0;
        uint32_t subPartIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; ++quadrant)
        {
            residualQTIntraChroma(mode, cuGeom, subPartIdx, tuDepth + 1);
            mergedCbfU |= cu.getCbf(subPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            mergedCbfV |= cu.getCbf(subPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            subPartIdx += partsPerQuadrant;
        }
        cu.m_cbf[1][absPartIdx] |= (mergedCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (mergedCbfV << tuDepth);
        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        // chroma is coded once per group of four partitions, on the first one, as a 4x4 block one depth up
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t stride   = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC = log2TrSizeC - 2;

    uint32_t curPartNum = cuGeom.numPartitions >> (tuDepthC * 2);
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const TextType ttype = (TextType)chromaId;

            // source, prediction and residual buffers for this plane
            const pixel* fenc     = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel*       pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t*     residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);

            // coefficients go directly into the CU's coefficient buffer
            const uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t*       coeffC       = cu.m_trCoeff[ttype] + coeffOffsetC;

            // the reconstruction goes directly into the reconstructed picture
            PicYuv*  reconPic  = m_frame->m_reconPic;
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;

            // resolve the prediction mode (derived-from-luma and the 4:2:2 angle remapping)
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

            const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (numSig)
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);

                // the aligned add_ps variant requires every offset and stride to be a multiple of 64
                const bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
                const bool predAlign      = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                const bool residualAlign  = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                const bool stridesAlign   = (picStride % 64 == 0) && (stride % 64 == 0);
                const bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && stridesAlign;

                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
1235
1236
/* Evaluates intraMode as an intra-coded CU with the given partition size: estimates luma (and,
 * unless the color space is 4:0:0, chroma) prediction and residual, counts the bits of the CU
 * syntax and coefficients, fills in the mode's distortion / bit / energy statistics and finally
 * updates the mode cost and checks the delta QP. */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    // distortion
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
    if (m_csp == X265_CSP_I400)
        intraMode.distortion += intraMode.lumaDistortion;
    else
    {
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
    }
    cu.m_distortion[0] = intraMode.distortion;

    // bits: CU-level syntax first
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    // the skip flag and prediction mode are only coded in non-intra slices
    int skipFlagBits = 0;
    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    // everything written so far except the skip flag is accounted as mvBits
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    // bits: coefficients
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    // luma energy terms, measured between the source and the reconstruction / prediction
    const Yuv* fencYuv = intraMode.fencYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
    }

    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
1288
1289
/* Note that this function does not save the best intra prediction, it must
1290
 * be generated later. It records the best mode in the cu */
1291
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1292
0
{
1293
0
    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1294
1295
0
    CUData& cu = intraMode.cu;
1296
0
    uint32_t depth = cuGeom.depth;
1297
1298
0
    cu.setPartSizeSubParts(SIZE_2Nx2N);
1299
0
    cu.setPredModeSubParts(MODE_INTRA);
1300
1301
0
    const uint32_t initTuDepth = 0;
1302
0
    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
1303
0
    uint32_t tuSize = 1 << log2TrSize;
1304
0
    const uint32_t absPartIdx = 0;
1305
1306
    // Reference sample smoothing
1307
0
    IntraNeighbors intraNeighbors;
1308
0
    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1309
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1310
1311
0
    const pixel* fenc = intraMode.fencYuv->m_buf[0];
1312
0
    uint32_t stride = intraMode.fencYuv->m_size;
1313
1314
0
    int sad, bsad;
1315
0
    uint32_t bits, bbits, mode, bmode;
1316
0
    uint64_t cost, bcost;
1317
1318
    // 33 Angle modes once
1319
0
    int scaleTuSize = tuSize;
1320
0
    int scaleStride = stride;
1321
0
    int costShift = 0;
1322
0
    int sizeIdx = log2TrSize - 2;
1323
1324
0
    if (tuSize > 32)
1325
0
    {
1326
        // CU is 64x64, we scale to 32x32 and adjust required parameters
1327
0
        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
1328
0
        fenc = m_fencScaled;
1329
1330
0
        pixel nScale[129];
1331
0
        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
1332
0
        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
1333
1334
        // we do not estimate filtering for downscaled samples
1335
0
        memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
1336
0
        memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
1337
1338
0
        scaleTuSize = 32;
1339
0
        scaleStride = 32;
1340
0
        costShift = 2;
1341
0
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1342
0
    }
1343
1344
0
    pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1345
0
    int predsize = scaleTuSize * scaleTuSize;
1346
1347
0
    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1348
1349
    /* there are three cost tiers for intra modes:
1350
     *  pred[0]          - mode probable, least cost
1351
     *  pred[1], pred[2] - less probable, slightly more cost
1352
     *  non-mpm modes    - all cost the same (rbits) */
1353
0
    uint64_t mpms;
1354
0
    uint32_t mpmModes[3];
1355
0
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1356
1357
    // DC
1358
0
    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1359
0
    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1360
0
    bmode = mode = DC_IDX;
1361
0
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1362
0
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1363
1364
    // PLANAR
1365
0
    pixel* planar = intraNeighbourBuf[0];
1366
0
    if (tuSize & (8 | 16 | 32))
1367
0
        planar = intraNeighbourBuf[1];
1368
1369
0
    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
1370
0
    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1371
0
    mode = PLANAR_IDX;
1372
0
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1373
0
    cost = m_rdCost.calcRdSADCost(sad, bits);
1374
0
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1375
1376
0
    bool allangs = true;
1377
0
    if (primitives.cu[sizeIdx].intra_pred_allangs)
1378
0
    {
1379
0
        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1380
0
        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
1381
0
    }
1382
0
    else
1383
0
        allangs = false;
1384
1385
0
#define TRY_ANGLE(angle) \
1386
0
    if (allangs) { \
1387
0
        if (angle < 18) \
1388
0
            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1389
0
        else \
1390
0
            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1391
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1392
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1393
0
    } else { \
1394
0
        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
1395
0
        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
1396
0
        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
1397
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1398
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1399
0
    }
1400
1401
0
    if (m_param->bEnableFastIntra)
1402
0
    {
1403
0
        int asad = 0;
1404
0
        uint32_t lowmode, highmode, amode = 5, abits = 0;
1405
0
        uint64_t acost = MAX_INT64;
1406
1407
        /* pick the best angle, sampling at distance of 5 */
1408
0
        for (mode = 5; mode < 35; mode += 5)
1409
0
        {
1410
0
            TRY_ANGLE(mode);
1411
0
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1412
0
        }
1413
1414
        /* refine best angle at distance 2, then distance 1 */
1415
0
        for (uint32_t dist = 2; dist >= 1; dist--)
1416
0
        {
1417
0
            lowmode = amode - dist;
1418
0
            highmode = amode + dist;
1419
1420
0
            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1421
0
            TRY_ANGLE(lowmode);
1422
0
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1423
1424
0
            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1425
0
            TRY_ANGLE(highmode);
1426
0
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1427
0
        }
1428
1429
0
        if (amode == 33)
1430
0
        {
1431
0
            TRY_ANGLE(34);
1432
0
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1433
0
        }
1434
1435
0
        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1436
0
    }
1437
0
    else // calculate and search all intra prediction angles for lowest cost
1438
0
    {
1439
0
        for (mode = 2; mode < 35; mode++)
1440
0
        {
1441
0
            TRY_ANGLE(mode);
1442
0
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1443
0
        }
1444
0
    }
1445
1446
0
    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
1447
0
    intraMode.initCosts();
1448
0
    intraMode.totalBits = bbits;
1449
0
    intraMode.distortion = bsad;
1450
0
    intraMode.sa8dCost = bcost;
1451
0
    intraMode.sa8dBits = bbits;
1452
0
}
1453
1454
/* Full RD measurement of a 2Nx2N intra CU in a non-intra slice.  Codes the
 * luma (and, unless the encode is 4:0:0, chroma) residual, counts the bits of
 * the CU syntax and fills the bit, distortion and energy fields of intraMode
 * before handing it to updateModeCost() and checkDQP(). */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv& recon = intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    /* luma residual quadtree, then pull the reconstruction out of it */
    Cost lumaCosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, lumaCosts, tuDepthRange);
    extractIntraResultQT(cu, recon, 0, 0);

    intraMode.lumaDistortion = lumaCosts.distortion;
    if (m_csp == X265_CSP_I400)
        intraMode.distortion = intraMode.lumaDistortion;
    else
    {
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
    }

    /* count the bits of the CU header syntax */
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    const int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();

    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    /* everything written after the skip flag is booked as "mvBits" */
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    /* ...followed by the coefficients */
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    /* optional perceptual term: psy-rd takes precedence over ssim-rd */
    const Yuv* fencYuv = intraMode.fencYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
    }

    /* SSE between the source and the prediction (not the reconstruction) */
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
1508
1509
/* Chooses the luma intra direction of every PU of the CU (one PU for 2Nx2N,
 * four otherwise) and codes the luma residual for it.
 *
 * A PU whose m_lumaIntraDir entry is already set (anything but ALL_IDX) keeps
 * that mode.  Otherwise all 35 modes are ranked by SA8D cost, the most
 * promising ones are measured with codeIntraLumaQT()/codeIntraLumaTSkip(), and
 * the winner is measured once more with TU splitting enabled.
 *
 * Returns the sum over all PUs of the distortion reported by that final pass. */
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    const uint32_t depth       = cuGeom.depth;
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
    const uint32_t numPU       = 1 << (2 * initTuDepth);
    const uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    const uint32_t tuSize      = 1 << log2TrSize;
    const uint32_t qNumParts   = cuGeom.numPartitions >> 2;
    const uint32_t sizeIdx     = log2TrSize - 2;

    /* transform skip is only tried for non-2Nx2N CUs that are not bypassed */
    const int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    sse_t totalDistortion = 0;
    uint32_t absPartIdx = 0;

    /* one iteration per PU */
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        if (intraMode.cu.m_lumaIntraDir[puIdx] == (uint8_t)ALL_IDX)
        {
            uint64_t candCostList[MAX_RD_INTRA_MODES];
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
            uint64_t bcost;
            const int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);

            {
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);

                /* reference sample setup */
                IntraNeighbors intraNeighbors;
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);

                /* rank the modes using the prediction signal only */
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
                const uint32_t stride = predYuv->m_size;

                const int scaleTuSize = tuSize;
                const int scaleStride = stride;
                const int costShift = 0;
                const int predSize = scaleTuSize * scaleTuSize;

                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

                /* mode signalling has three cost tiers:
                 *  mpmModes[0]              - most probable, cheapest
                 *  mpmModes[1], mpmModes[2] - less probable, slightly more
                 *  every other mode         - one shared cost (rbits) */
                uint64_t mpms;
                uint32_t mpmModes[3];
                const uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);

                const pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
                uint64_t modeCosts[35];
                uint32_t bits;
                uint32_t sad;

                /* DC seeds the running minimum */
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);

                /* PLANAR reads neighbour buffer 1 for 8x8..32x32 blocks */
                pixel* planar = (tuSize >= 8 && tuSize <= 32) ? intraNeighbourBuf[1] : intraNeighbourBuf[0];

                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);

                /* angular modes 2..34 */
                if (primitives.cu[sizeIdx].intra_pred_allangs)
                {
                    /* all predictions come from one call; modes below 18 are
                     * compared against a transposed copy of the source */
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                    for (int mode = 2; mode < 35; mode++)
                    {
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        if (mode < 18)
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * predSize], scaleTuSize) << costShift;
                        else
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * predSize], scaleTuSize) << costShift;
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }
                else
                {
                    /* predict and measure one angle at a time */
                    for (int mode = 2; mode < 35; mode++)
                    {
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                        COPY1_IF_LT(bcost, modeCosts[mode]);
                    }
                }

                /* Keep at most maxCandCount candidates: modes whose cost is
                 * below 1.25 times the best cost, plus mpmModes[0].
                 * maxCandCount grows with the RD level and with depth, so more
                 * modes are tried at slower settings and at higher depths. */
                for (int i = 0; i < maxCandCount; i++)
                    candCostList[i] = MAX_INT64;

                const uint64_t paddedBcost = bcost + (bcost >> 2); // bcost * 1.25
                for (int mode = 0; mode < 35; mode++)
                {
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0]))
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
                }
            }

            /* measure the surviving candidates with simple RDO (no TU
             * splits); a MAX_INT64 entry marks the end of the used slots */
            bcost = MAX_INT64;
            for (int i = 0; i < maxCandCount && candCostList[i] != MAX_INT64; i++)
            {
                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

                m_entropyCoder.load(m_rqt[depth].cur);
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);

                Cost candCosts;
                if (checkTransformSkip)
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, candCosts);
                else
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, candCosts, depthRange);
                COPY2_IF_LT(bcost, candCosts.rdcost, bmode, rdModeList[i]);
            }
        }
        else
        {
            /* the mode was decided before this call */
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
        }

        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

        /* measure the chosen mode again, this time allowing TU splits */
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
        m_entropyCoder.load(m_rqt[depth].cur);

        Cost finalCosts;
        if (checkTransformSkip)
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, finalCosts);
        else
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, finalCosts, depthRange);
        totalDistortion += finalCosts.distortion;

        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);

        /* Every PU except the last copies its reconstruction into the frame's
         * recon picture so that the next PU can predict from it.
         *
         * This has important implications for parallelism and RDO: it writes
         * intermediate results into the output recon picture, so INTRA_NxN
         * cannot proceed in parallel with anything else.  The contexts in
         * m_rqt[depth].cur are also not updated for the later PUs, which is
         * suspected to be slightly wrong; they should probably be tracked
         * through each PU. */
        if (puIdx != numPU - 1)
        {
            PicYuv* reconPic = m_frame->m_reconPic;
            const pixel* src = reconYuv->getLumaAddr(absPartIdx);
            const uint32_t srcstride = reconYuv->m_size;
            pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
            const uint32_t dststride = reconPic->m_stride;

            primitives.cu[sizeIdx].copy_pp(dst, dststride, src, srcstride);
        }
    }

    if (numPU > 1)
    {
        /* fold the four PU-level luma cbf flags into the CU-level entry */
        uint32_t combCbfY = 0;
        uint32_t qPartIdx = 0;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
            qPartIdx += qNumParts;
        }

        cu.m_cbf[0][0] |= combCbfY;
    }

    // TODO: remove this
    m_entropyCoder.load(m_rqt[depth].cur);

    return totalDistortion;
}
1697
1698
/* Picks the chroma intra mode with the lowest combined U+V SA8D between the
 * source and the prediction, and stores it in the CU.  Only the distortion is
 * compared; no bit cost is added. */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    const uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    if (tuSize > 32)
    {
        /* chroma block larger than 32x32: measure with the 32x32 primitive at
         * TU depth 1 and scale the cost up by four */
        log2TrSizeC = 5;
        costShift = 2;
        tuDepth = 1;
    }

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);

    uint32_t modeList[NUM_CHROMA_MODE];
    cu.getAllowedChromaDir(0, modeList);

    uint32_t bestMode = 0;
    uint64_t bestCost = MAX_INT64;

    /* try every allowed chroma mode */
    for (uint32_t idx = 0; idx < NUM_CHROMA_MODE; idx++)
    {
        /* resolve the signalled mode into the direction actually predicted:
         * "derived" means reuse the luma direction, and 4:2:2 remaps the
         * angle through a lookup table */
        uint32_t chromaPredMode = modeList[idx];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[0];
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        uint64_t cost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];

            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);

            /* generate the prediction and accumulate its SA8D.
             * NOTE(review): fenc is read with predYuv's chroma stride and
             * pred with fencYuv's - presumably the two are always equal;
             * confirm. */
            predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC);
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
        }

        if (cost < bestCost)
        {
            /* remember the signalled mode, not the remapped direction */
            bestMode = modeList[idx];
            bestCost = cost;
        }
    }

    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
1753
1754
/* Chooses the chroma intra mode by rate-distortion cost and leaves the cbf,
 * transform-skip flags, reconstruction and mode of the winner in the CU and
 * in intraMode.reconYuv.
 *
 * The CU is handled as one section, or as four when it is not 2Nx2N and the
 * encode is 4:4:4.  Returns the summed distortion of the winning mode of
 * every section. */
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv& reconYuv = intraMode.reconYuv;

    const uint32_t depth       = cuGeom.depth;
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
    const uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    const uint32_t absPartStep = cuGeom.numPartitions;
    const int size = partitionFromLog2Size(log2TrSize);

    sse_t totalDistortion = 0;

    TURecurse tuIterator(initTuDepth ? QUAD_SPLIT : DONT_SPLIT, absPartStep, 0);

    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        /* winner of this section */
        uint64_t bestCost = MAX_INT64;
        sse_t bestDist = 0;
        uint32_t bestMode = 0;

        /* build the list of modes to try */
        const uint32_t minMode = 0;
        uint32_t maxMode = NUM_CHROMA_MODE;
        uint32_t modeList[NUM_CHROMA_MODE];

        if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
        {
            /* the mode was decided before this call: try only that one */
            for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
            maxMode = 1;
        }
        else
        {
            cu.getAllowedChromaDir(absPartIdxC, modeList);
        }

        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
        {
            /* the source picture is 4:0:0 while the encode is not: try only
             * the first mode of the list */
            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
                modeList[l] = modeList[0];
            maxMode = 1;
        }

        /* measure each mode */
        for (uint32_t mode = minMode; mode < maxMode; mode++)
        {
            /* every candidate starts from the same context models */
            m_entropyCoder.load(m_rqt[depth].cur);

            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);

            Cost outCost;
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);

            if (m_slice->m_pps->bTransformSkipEnabled)
                m_entropyCoder.load(m_rqt[depth].cur);

            m_entropyCoder.resetBits();

            /* The chroma prediction mode is coded only at partition 0 of the
             * CU, or, for non-2Nx2N 4:4:4, at the first partition of each
             * quadrant. */
            bool bCodeChromaDir;
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
                bCodeChromaDir = !absPartIdxC;
            else
            {
                const uint32_t qNumParts = cuGeom.numPartitions >> 2;
                bCodeChromaDir = !(absPartIdxC & (qNumParts - 1));
            }
            if (bCodeChromaDir)
                m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);

            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
            const uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();

            /* psy-rd takes precedence over ssim-rd, which takes precedence
             * over the plain RD cost */
            uint64_t cost;
            if (m_rdCost.m_psyRd)
                cost = m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy);
            else if (m_rdCost.m_ssimRd)
                cost = m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy);
            else
                cost = m_rdCost.calcRdCost(outCost.distortion, bits);

            if (cost < bestCost)
            {
                bestCost = cost;
                bestDist = outCost.distortion;
                bestMode = modeList[mode];

                /* save the winner's reconstruction and per-partition flags;
                 * later candidates overwrite the copies held in the CU */
                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
            }
        }

        if (!tuIterator.isLastSection())
        {
            /* copy the Cb and Cr reconstruction of this section into the
             * frame's recon picture - presumably so the following sections
             * can predict from it, as in the luma path */
            PicYuv* reconPic = m_frame->m_reconPic;
            const uint32_t zorder = cuGeom.absPartIdx + absPartIdxC;
            const uint32_t dststride = reconPic->m_strideC;

            pixel* dstCb = reconPic->getCbAddr(cu.m_cuAddr, zorder);
            const pixel* srcCb = reconYuv.getCbAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dstCb, dststride, srcCb, reconYuv.m_csize);

            pixel* dstCr = reconPic->getCrAddr(cu.m_cuAddr, zorder);
            const pixel* srcCr = reconYuv.getCrAddr(absPartIdxC);
            primitives.chroma[m_csp].cu[size].copy_pp(dstCr, dststride, srcCr, reconYuv.m_csize);
        }

        /* put the winner's flags and mode back into the CU */
        memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
        memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
        totalDistortion += bestDist;
    }
    while (tuIterator.isNextSection());

    if (initTuDepth != 0)
    {
        /* fold the four section-level cbf flags into the CU-level entries */
        const uint32_t qNumParts = tuIterator.absPartIdxStep;
        uint32_t combCbfU = 0;
        uint32_t combCbfV = 0;
        uint32_t qPartIdx = 0;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
        {
            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
            qPartIdx += qNumParts;
        }

        cu.m_cbf[1][0] |= combCbfU;
        cu.m_cbf[2][0] |= combCbfV;
    }

    /* TODO: remove this */
    m_entropyCoder.load(m_rqt[depth].cur);
    return totalDistortion;
}
1889
1890
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
1891
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
1892
0
{
1893
0
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
1894
1895
0
    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
1896
0
    uint8_t  candDir[MRG_MAX_NUM_CANDS];
1897
0
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
1898
1899
0
    if (cu.isBipredRestriction())
1900
0
    {
1901
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
1902
0
        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
1903
0
        {
1904
0
            if (candDir[mergeCand] == 3)
1905
0
            {
1906
0
                candDir[mergeCand] = 1;
1907
0
                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
1908
0
            }
1909
0
        }
1910
0
    }
1911
1912
0
    Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1913
1914
0
    uint32_t outCost = MAX_UINT;
1915
0
    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
1916
0
    {
1917
        /* Prevent TMVP candidates from using unavailable reference pixels */
1918
0
        if (m_bFrameParallel)
1919
0
        {
1920
            // Parallel slices bound check
1921
0
            if (m_param->maxSlices > 1)
1922
0
            {
1923
0
                if (cu.m_bFirstRowInSlice &
1924
0
                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
1925
0
                    continue;
1926
1927
                // Last row in slice can't reference beyond bound since it is another slice area
1928
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
1929
0
                if (cu.m_bLastRowInSlice &&
1930
0
                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
1931
0
                    continue;
1932
0
            }
1933
1934
0
            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1935
0
                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
1936
0
                continue;
1937
0
        }
1938
1939
0
        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
1940
0
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
1941
0
        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
1942
0
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
1943
1944
0
        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
1945
1946
0
        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
1947
0
        if (m_me.bChromaSATD)
1948
0
            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
1949
1950
0
        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
1951
0
        costCand = costCand + m_rdCost.getCost(bitsCand);
1952
0
        if (costCand < outCost)
1953
0
        {
1954
0
            outCost = costCand;
1955
0
            m.bits = bitsCand;
1956
0
            m.index = mergeCand;
1957
0
        }
1958
0
    }
1959
1960
0
    m.mvField[0] = candMvField[m.index][0];
1961
0
    m.mvField[1] = candMvField[m.index][1];
1962
0
    m.dir = candDir[m.index];
1963
1964
0
    return outCost;
1965
0
}
1966
1967
/* find the lowres motion vector from lookahead in middle of current PU */
1968
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
1969
0
{
1970
0
    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
1971
0
    if (diffPoc > m_param->bframes + 1)
1972
        /* poc difference is out of range for lookahead */
1973
0
        return 0;
1974
1975
0
    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc];
1976
0
    if (mvs[0].x == 0x7FFF)
1977
        /* this motion search was not estimated by lookahead */
1978
0
        return 0;
1979
1980
0
    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
1981
0
    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
1982
0
    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
1983
1984
0
    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
1985
0
    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
1986
1987
0
    return mvs[idx] << 1; /* scale up lowres mv */
1988
0
}
1989
1990
/* Pick between the two AMVP candidates which is the best one to use as
1991
 * MVP for the motion search, based on SAD cost */
1992
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
1993
0
{
1994
0
    if (amvp[0] == amvp[1])
1995
0
        return 0;
1996
1997
0
    Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
1998
0
    uint32_t costs[AMVP_NUM_CANDS];
1999
2000
0
    for (int i = 0; i < AMVP_NUM_CANDS; i++)
2001
0
    {
2002
0
        MV mvCand = amvp[i];
2003
2004
        // NOTE: skip mvCand if Y is > merange and -FN>1
2005
0
        if (m_bFrameParallel)
2006
0
        {
2007
0
            costs[i] = m_me.COST_MAX;
2008
2009
0
            if (mvCand.y >= (m_param->searchRange + 1) * 4)
2010
0
                continue;
2011
2012
0
            if ((m_param->maxSlices > 1) &
2013
0
                ((mvCand.y < m_sliceMinY)
2014
0
              |  (mvCand.y > m_sliceMaxY)))
2015
0
                continue;
2016
0
        }
2017
0
        cu.clipMv(mvCand);
2018
0
        predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
2019
0
        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
2020
0
    }
2021
2022
0
    return (costs[0] <= costs[1]) ? 0 : 1;
2023
0
}
2024
2025
void Search::PME::processTasks(int workerThreadId)
2026
0
{
2027
#if DETAILED_CU_STATS
2028
    int fe = mode.cu.m_encData->m_frameEncoderID;
2029
    master.m_stats[fe].countPMETasks++;
2030
    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
2031
#endif
2032
0
    ProfileScopeEvent(pme);
2033
0
    master.processPME(*this, master.m_tld[workerThreadId].analysis);
2034
0
}
2035
2036
/* Pull motion estimation jobs from 'pme' and run them on 'slave' until no
 * jobs remain. Job ids below m_jobs.refCnt[0] address list 0 references, the
 * remaining ids address list 1 references. Job acquisition is serialized by
 * pme.m_lock. */
void Search::processPME(PME& pme, Search& slave)
{
    /* acquire a motion estimation job, else exit early */
    int jobId = -1;
    pme.m_lock.acquire();
    if (pme.m_jobAcquired < pme.m_jobTotal)
        jobId = pme.m_jobAcquired++;
    pme.m_lock.release();

    if (jobId < 0)
        return;

    /* Setup slave Search instance for ME for master's CU */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);

        /* chroma is present unless the source picture is 4:0:0 */
        bool bHasChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx,
                               pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bHasChroma);
    }

    /* Perform ME, repeat until no more work is available */
    while (jobId >= 0)
    {
        if (jobId < pme.m_jobs.refCnt[0])
        {
            /* L0 */
            int l0Ref = pme.m_jobs.ref[0][jobId];
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, l0Ref);
        }
        else
        {
            /* L1: ids continue after the L0 jobs */
            int l1Ref = pme.m_jobs.ref[1][jobId - pme.m_jobs.refCnt[0]];
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, l1Ref);
        }

        /* try to grab the next job; -1 ends the loop */
        jobId = -1;
        pme.m_lock.acquire();
        if (pme.m_jobAcquired < pme.m_jobTotal)
            jobId = pme.m_jobAcquired++;
        pme.m_lock.release();
    }
}
2085
2086
/* Run one unidirectional motion search for reference 'ref' of 'list' on the
 * given PU and, under master.m_meLock, record the result in
 * interMode.bestME[part][list] if it beats the stored cost (or ties it with a
 * smaller reference index). The list selection bits are read from 'master';
 * the search itself uses this instance's m_me. */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    /* signalling bits: list selection + MVP index + reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    bits += getTUBits(ref, m_slice->m_numRefIdx[list]);

    // 12 mv candidates including lowresMV
    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);

    MV mvmin, mvmax, outmv, mvp_lowres;
    MV mvp = amvp[mvpIdx];
    bool bLowresMVP = false;

    /* Prevents load/save outputs from diverging if lowresMV is not available */
    if (!m_param->analysisSave && !m_param->analysisLoad)
    {
        MV lowresMv = getLowresMV(interMode.cu, pu, list, ref);
        if (lowresMv.notZero())
            mvc[numMvc++] = lowresMv;
        if (m_param->bEnableHME)
            mvp_lowres = lowresMv;
    }

    /* first search: around the selected AMVP candidate */
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

    /* optional second search: around the lowres MV, kept only when cheaper */
    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
    {
        MV lowresOutMv;
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
        int lowresCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, lowresOutMv, m_param->maxSlices,
                                             m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
        if (lowresCost < satdCost)
        {
            outmv = lowresOutMv;
            satdCost = lowresCost;
            bLowresMVP = true;
        }
    }

    /* Get total cost of partition, but only include MV bit cost once */
    bits += m_me.bitcost(outmv);
    uint32_t mvCost = m_me.mvcost(outmv);
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);

    /* Update LowresMVP to best AMVP cand*/
    if (bLowresMVP)
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);

    /* Refine MVP selection, updates: mvpIdx, bits, cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    MotionData* bestME = interMode.bestME[part];

    /* tie goes to the smallest ref ID, just like --no-pme */
    ScopedLock _lock(master.m_meLock);
    MotionData& best = bestME[list];
    const bool bCheaper = cost < best.cost;
    if (bCheaper || (cost == best.cost && ref < best.ref))
    {
        best.mv      = outmv;
        best.mvp     = mvp;
        best.mvpIdx  = mvpIdx;
        best.ref     = ref;
        best.cost    = cost;
        best.bits    = bits;
        best.mvCost  = mvCost;
    }
}
2155
/* Run a motion search around each of the first m_param->mvRefine predictors
 * in 'mvp' (skipping predictors equal to an earlier neighbour in the array)
 * and write the MV of the cheapest search to 'outmv'. 'outmv' is left
 * untouched if no search is performed. */
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
{
    CUData& cu = interMode.cu;
    MV startMv, mvmin, mvmax;
    int lowestCost = INT_MAX;

    for (int cand = 0; cand < m_param->mvRefine; cand++)
    {
        /* skip a predictor that repeats the previous one, or (for the third
         * predictor) the first one */
        if (cand && (mvp[cand] == mvp[cand - 1] || (cand == 2 && mvp[cand] == mvp[cand - 2])))
            continue;

        MV candBestMv;
        startMv = mvp[cand];
        cu.clipMv(startMv);
        setSearchRange(cu, startMv, m_param->searchRange, mvmin, mvmax);

        int candCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, startMv, numMvc, mvc, m_param->searchRange, candBestMv, m_param->maxSlices,
                                           m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

        if (candCost < lowestCost)
        {
            lowestCost = candCost;
            outmv = candBestMv;
        }
    }
}
2180
/* find the best inter prediction for each PU of specified mode */
2181
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
2182
0
{
2183
0
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
2184
2185
0
    CUData& cu = interMode.cu;
2186
0
    Yuv* predYuv = &interMode.predYuv;
2187
2188
    // 12 mv candidates including lowresMV
2189
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2190
2191
0
    const Slice *slice = m_slice;
2192
0
    int numPart     = cu.getNumPartInter(0);
2193
0
    int numPredDir  = slice->isInterP() ? 1 : 2;
2194
0
    const int* numRefIdx = slice->m_numRefIdx;
2195
0
    uint32_t lastMode = 0;
2196
0
    int      totalmebits = 0;
2197
0
    MV       mvzero(0, 0);
2198
0
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2199
0
    MergeData merge;
2200
0
    memset(&merge, 0, sizeof(merge));
2201
0
    bool useAsMVP = false;
2202
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
2203
0
    {
2204
0
        MotionData* bestME = interMode.bestME[puIdx];
2205
0
        PredictionUnit pu(cu, cuGeom, puIdx);
2206
0
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
2207
0
        useAsMVP = false;
2208
0
        x265_analysis_inter_data* interDataCTU = NULL;
2209
0
        int cuIdx;
2210
0
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
2211
0
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
2212
0
        {
2213
0
            interDataCTU = m_frame->m_analysisData.interData;
2214
0
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
2215
0
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
2216
0
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
2217
0
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
2218
0
                useAsMVP = true;
2219
0
        }
2220
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
2221
0
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
2222
0
        bestME[0].cost = MAX_UINT;
2223
0
        bestME[1].cost = MAX_UINT;
2224
2225
0
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
2226
0
        bool bDoUnidir = true;
2227
2228
0
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
2229
        /* Uni-directional prediction */
2230
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2231
0
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
2232
0
        {
2233
0
            for (int list = 0; list < numPredDir; list++)
2234
0
            {
2235
2236
0
                int ref = -1;
2237
0
                if (useAsMVP)
2238
0
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
2239
0
                else
2240
0
                    ref = bestME[list].ref;
2241
0
                if (ref < 0)
2242
0
                {
2243
0
                    continue;
2244
0
                }
2245
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2246
0
                bits += getTUBits(ref, numRefIdx[list]);
2247
2248
0
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2249
0
                const MV* amvp = interMode.amvpCand[list][ref];
2250
0
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2251
0
                MV mvmin, mvmax, outmv, mvp;
2252
0
                if (useAsMVP)
2253
0
                {
2254
0
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
2255
0
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
2256
0
                }
2257
0
                else
2258
0
                    mvp = amvp[mvpIdx];
2259
0
                if (m_param->searchMethod == X265_SEA)
2260
0
                {
2261
0
                    int puX = puIdx & 1;
2262
0
                    int puY = puIdx >> 1;
2263
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2264
0
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
2265
0
                }
2266
0
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2267
0
                MV mvpIn = mvp;
2268
0
                int satdCost;
2269
0
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
2270
0
                    mvpIn = bestME[list].mv;
2271
0
                if (useAsMVP && m_param->mvRefine > 1)
2272
0
                {
2273
0
                    MV bestmv, mvpSel[3];
2274
0
                    int mvpIdxSel[3];
2275
0
                    satdCost = m_me.COST_MAX;
2276
0
                    mvpSel[0] = mvp;
2277
0
                    mvpIdxSel[0] = mvpIdx;
2278
0
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2279
0
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
2280
0
                    mvpIdxSel[1] = mvpIdx;
2281
0
                    if (m_param->mvRefine > 2)
2282
0
                    {
2283
0
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
2284
0
                        mvpIdxSel[2] = !mvpIdx;
2285
0
                    }
2286
0
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
2287
0
                    {
2288
0
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
2289
0
                            continue;
2290
0
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
2291
0
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
2292
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2293
0
                        if (satdCost > bcost)
2294
0
                        {
2295
0
                            satdCost = bcost;
2296
0
                            outmv = bestmv;
2297
0
                            mvp = mvpSel[cand];
2298
0
                            mvpIdx = mvpIdxSel[cand];
2299
0
                        }
2300
0
                    }
2301
0
                    mvpIn = mvp;
2302
0
                }
2303
0
                else
2304
0
                {
2305
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
2306
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2307
0
                }
2308
2309
                /* Get total cost of partition, but only include MV bit cost once */
2310
0
                bits += m_me.bitcost(outmv);
2311
0
                uint32_t mvCost = m_me.mvcost(outmv);
2312
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2313
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
2314
0
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
2315
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2316
0
                else
2317
0
                {
2318
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
2319
                      the actual mvp is bestME from pass 1 for that mvpIdx */
2320
0
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
2321
0
                    if (diffBits < 0)
2322
0
                    {
2323
0
                        mvpIdx = !mvpIdx;
2324
0
                        uint32_t origOutBits = bits;
2325
0
                        bits = origOutBits + diffBits;
2326
0
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
2327
0
                    }
2328
0
                    mvp = amvp[mvpIdx];
2329
0
                }
2330
2331
0
                if (cost < bestME[list].cost)
2332
0
                {
2333
0
                    bestME[list].mv = outmv;
2334
0
                    bestME[list].mvp = mvp;
2335
0
                    bestME[list].mvpIdx = mvpIdx;
2336
0
                    bestME[list].cost = cost;
2337
0
                    bestME[list].bits = bits;
2338
0
                    bestME[list].mvCost  = mvCost;
2339
0
                    bestME[list].ref = ref;
2340
0
                }
2341
0
                bDoUnidir = false;
2342
0
            }            
2343
0
        }
2344
0
        else if (m_param->bDistributeMotionEstimation)
2345
0
        {
2346
0
            PME pme(*this, interMode, cuGeom, pu, puIdx);
2347
0
            pme.m_jobTotal = 0;
2348
0
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
2349
2350
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2351
0
            for (int list = 0; list < numPredDir; list++)
2352
0
            {
2353
0
                int idx = 0;
2354
0
                for (int ref = 0; ref < numRefIdx[list]; ref++)
2355
0
                {
2356
0
                    if (!(refMask & (1 << ref)))
2357
0
                        continue;
2358
2359
0
                    pme.m_jobs.ref[list][idx++]  = ref;
2360
0
                    pme.m_jobTotal++;
2361
0
                }
2362
0
                pme.m_jobs.refCnt[list] = idx;
2363
2364
                /* the second list ref bits start at bit 16 */
2365
0
                refMask >>= 16;
2366
0
            }
2367
2368
0
            if (pme.m_jobTotal > 2)
2369
0
            {
2370
0
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
2371
2372
0
                processPME(pme, *this);
2373
2374
0
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
2375
0
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
2376
2377
0
                bDoUnidir = false;
2378
2379
0
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
2380
0
                pme.waitForExit();
2381
0
            }
2382
2383
            /* if no peer threads were bonded, fall back to doing unidirectional
2384
             * searches ourselves without overhead of singleMotionEstimation() */
2385
0
        }
2386
0
        if (bDoUnidir)
2387
0
        {
2388
0
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
2389
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2390
2391
0
            for (int list = 0; list < numPredDir; list++)
2392
0
            {
2393
0
                for (int ref = 0; ref < numRefIdx[list]; ref++)
2394
0
                {
2395
0
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
2396
2397
0
                    if (!(refMask & (1 << ref)))
2398
0
                    {
2399
0
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
2400
0
                        continue;
2401
0
                    }
2402
2403
0
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2404
0
                    bits += getTUBits(ref, numRefIdx[list]);
2405
2406
0
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2407
2408
0
                    const MV* amvp = interMode.amvpCand[list][ref];
2409
0
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2410
0
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2411
0
                    bool bLowresMVP = false;
2412
2413
0
                    if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
2414
0
                    {
2415
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
2416
0
                        if (lmv.notZero())
2417
0
                            mvc[numMvc++] = lmv;
2418
0
                        if (m_param->bEnableHME)
2419
0
                            mvp_lowres = lmv;
2420
0
                    }
2421
0
                    if (m_param->searchMethod == X265_SEA)
2422
0
                    {
2423
0
                        int puX = puIdx & 1;
2424
0
                        int puY = puIdx >> 1;
2425
0
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2426
0
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
2427
0
                    }
2428
0
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2429
0
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
2430
0
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2431
2432
0
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2433
0
                    {
2434
0
                        MV outmv_lowres;
2435
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2436
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
2437
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2438
0
                        if (lowresMvCost < satdCost)
2439
0
                        {
2440
0
                            outmv = outmv_lowres;
2441
0
                            satdCost = lowresMvCost;
2442
0
                            bLowresMVP = true;
2443
0
                        }
2444
0
                    }
2445
2446
                    /* Get total cost of partition, but only include MV bit cost once */
2447
0
                    bits += m_me.bitcost(outmv);
2448
0
                    uint32_t mvCost = m_me.mvcost(outmv);
2449
0
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2450
                    /* Update LowresMVP to best AMVP cand*/
2451
0
                    if (bLowresMVP)
2452
0
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2453
2454
                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2455
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2456
2457
0
                    if (cost < bestME[list].cost)
2458
0
                    {
2459
0
                        bestME[list].mv      = outmv;
2460
0
                        bestME[list].mvp     = mvp;
2461
0
                        bestME[list].mvpIdx  = mvpIdx;
2462
0
                        bestME[list].ref     = ref;
2463
0
                        bestME[list].cost    = cost;
2464
0
                        bestME[list].bits    = bits;
2465
0
                        bestME[list].mvCost  = mvCost;
2466
0
                    }
2467
0
                }
2468
                /* the second list ref bits start at bit 16 */
2469
0
                refMask >>= 16;
2470
0
            }
2471
0
        }
2472
2473
        /* Bi-directional prediction */
2474
0
        MotionData bidir[2];
2475
0
        uint32_t bidirCost = MAX_UINT;
2476
0
        int bidirBits = 0;
2477
2478
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
2479
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
2480
0
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
2481
0
        {
2482
0
            bidir[0] = bestME[0];
2483
0
            bidir[1] = bestME[1];
2484
2485
0
            int satdCost;
2486
2487
0
            if (m_me.bChromaSATD)
2488
0
            {
2489
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
2490
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2491
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
2492
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2493
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
2494
2495
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2496
0
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2497
0
            }
2498
0
            else
2499
0
            {
2500
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
2501
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
2502
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2503
2504
                /* Generate reference subpels */
2505
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
2506
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
2507
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
2508
0
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
2509
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2510
0
            }
2511
2512
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2513
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2514
2515
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2516
0
            if (bTryZero)
2517
0
            {
2518
                /* Do not try zero MV if unidir motion predictors are beyond
2519
                 * valid search area */
2520
0
                MV mvmin, mvmax;
2521
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2522
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2523
0
                mvmax.y += 2; // there is some pad for subpel refine
2524
0
                mvmin <<= 2;
2525
0
                mvmax <<= 2;
2526
2527
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2528
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2529
0
            }
2530
0
            if (bTryZero)
2531
0
            {
2532
                /* coincident blocks of the two reference pictures */
2533
0
                if (m_me.bChromaSATD)
2534
0
                {
2535
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
2536
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2537
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
2538
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2539
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
2540
2541
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2542
0
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2543
0
                }
2544
0
                else
2545
0
                {
2546
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2547
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2548
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
2549
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2550
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2551
0
                }
2552
0
                MV mvp0 = bestME[0].mvp;
2553
0
                int mvpIdx0 = bestME[0].mvpIdx;
2554
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2555
2556
0
                MV mvp1 = bestME[1].mvp;
2557
0
                int mvpIdx1 = bestME[1].mvpIdx;
2558
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
2559
2560
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2561
2562
                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2563
0
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
2564
0
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
2565
2566
0
                if (cost < bidirCost)
2567
0
                {
2568
0
                    bidir[0].mv = mvzero;
2569
0
                    bidir[1].mv = mvzero;
2570
0
                    bidir[0].mvp = mvp0;
2571
0
                    bidir[1].mvp = mvp1;
2572
0
                    bidir[0].mvpIdx = mvpIdx0;
2573
0
                    bidir[1].mvpIdx = mvpIdx1;
2574
0
                    bidirCost = cost;
2575
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2576
0
                }
2577
0
            }
2578
0
        }
2579
2580
        /* select best option and store into CU */
2581
0
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
2582
0
        {
2583
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
2584
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
2585
0
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
2586
0
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
2587
0
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
2588
0
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
2589
0
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
2590
2591
0
            totalmebits += merge.bits;
2592
0
        }
2593
0
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
2594
0
        {
2595
0
            lastMode = 2;
2596
2597
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2598
0
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
2599
0
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
2600
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
2601
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
2602
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
2603
2604
0
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
2605
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
2606
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
2607
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
2608
2609
0
            totalmebits += bidirBits;
2610
0
        }
2611
0
        else if (bestME[0].cost <= bestME[1].cost)
2612
0
        {
2613
0
            lastMode = 0;
2614
2615
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2616
0
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
2617
0
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
2618
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
2619
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
2620
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
2621
2622
0
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
2623
0
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
2624
2625
0
            totalmebits += bestME[0].bits;
2626
0
        }
2627
0
        else
2628
0
        {
2629
0
            lastMode = 1;
2630
2631
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
2632
0
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
2633
0
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
2634
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
2635
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
2636
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
2637
2638
0
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
2639
0
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
2640
2641
0
            totalmebits += bestME[1].bits;
2642
0
        }
2643
2644
0
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
2645
0
    }
2646
0
    interMode.sa8dBits += totalmebits;
2647
0
}
2648
2649
/* Fill blockBit[0..2] with fixed signalling-bit estimates for one prediction unit.
 *
 *   cuMode   - partition shape of the CU
 *   bPSlice  - true when the slice is a P slice
 *   partIdx  - index of the PU inside the CU (first table dimension; used only for
 *              rectangular partitions in non-P slices)
 *   lastMode - second table dimension; presumably the prediction direction picked
 *              for the previous PU (TODO confirm against the caller)
 *   blockBit - output, three entries
 *
 * An unrecognised cuMode trips X265_CHECK and leaves blockBit untouched. */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    switch (cuMode)
    {
    case SIZE_2Nx2N:
    case SIZE_NxN:
        /* square partitions share one estimate; only entry 0 depends on the slice type */
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        break;

    case SIZE_2NxN:
    case SIZE_2NxnU:
    case SIZE_2NxnD:
    case SIZE_Nx2N:
    case SIZE_nLx2N:
    case SIZE_nRx2N:
    {
        if (bPSlice)
        {
            /* P slices use the same constants for every rectangular shape */
            blockBit[0] = 3;
            blockBit[1] = 0;
            blockBit[2] = 0;
            break;
        }

        /* indexed [partIdx][lastMode][entry]; horizontal split family (2NxN, 2NxnU, 2NxnD) */
        static const uint32_t horzSplitBits[2][3][3] =
        {
            { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
            { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
        };
        /* indexed [partIdx][lastMode][entry]; vertical split family (Nx2N, nLx2N, nRx2N) */
        static const uint32_t vertSplitBits[2][3][3] =
        {
            { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
            { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
        };

        const bool bHorzSplit = (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD);
        const uint32_t* row = bHorzSplit ? horzSplitBits[partIdx][lastMode]
                                         : vertSplitBits[partIdx][lastMode];
        for (int i = 0; i < 3; i++)
            blockBit[i] = row[i];
        break;
    }

    default:
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        break;
    }
}
2700
2701
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
2702
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
2703
0
{
2704
0
    int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);
2705
0
    if (diffBits < 0)
2706
0
    {
2707
0
        mvpIdx = !mvpIdx;
2708
0
        uint32_t origOutBits = outBits;
2709
0
        outBits = origOutBits + diffBits;
2710
0
        outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2711
0
    }
2712
0
    return amvpCand[mvpIdx];
2713
0
}
2714
2715
/* Update to default MVP when using an alternative mvp */
2716
void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
2717
0
{
2718
0
    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
2719
0
    uint32_t origOutBits = outBits;
2720
0
    outBits = origOutBits + diffBits;
2721
0
    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2722
0
}
2723
2724
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
2725
0
{
2726
0
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
2727
0
    mvmin = mvp - dist;
2728
0
    mvmax = mvp + dist;
2729
2730
0
    cu.clipMv(mvmin);
2731
0
    cu.clipMv(mvmax);
2732
2733
0
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
2734
0
          cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
2735
0
          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
2736
0
    {
2737
0
        int safeX, maxSafeMv;
2738
0
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
2739
0
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
2740
0
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
2741
0
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
2742
0
    }
2743
2744
    // apply restrict on slices
2745
0
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
2746
0
    {
2747
0
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
2748
0
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
2749
0
    }
2750
2751
    /* Clip search range to signaled maximum MV length.
2752
     * We do not support this VUI field being changed from the default */
2753
0
    const int maxMvLen = (1 << 15) - 1;
2754
0
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
2755
0
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
2756
0
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
2757
0
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);
2758
2759
0
    mvmin >>= 2;
2760
0
    mvmax >>= 2;
2761
2762
    /* conditional clipping for frame parallelism */
2763
0
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
2764
0
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);
2765
2766
    /* conditional clipping for negative mv range */
2767
0
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
2768
0
}
2769
2770
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2771
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
2772
0
{
2773
0
    CUData& cu = interMode.cu;
2774
0
    Yuv* reconYuv = &interMode.reconYuv;
2775
0
    const Yuv* fencYuv = interMode.fencYuv;
2776
0
    Yuv* predYuv = &interMode.predYuv;
2777
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2778
0
    uint32_t depth  = cu.m_cuDepth[0];
2779
2780
    // No residual coding : SKIP mode
2781
2782
0
    cu.setPredModeSubParts(MODE_SKIP);
2783
0
    cu.clearCbf();
2784
0
    cu.setTUDepthSubParts(0, 0, depth);
2785
2786
0
    reconYuv->copyFromYuv(interMode.predYuv);
2787
2788
    // Luma
2789
0
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
2790
0
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2791
0
    interMode.distortion = interMode.lumaDistortion;
2792
    // Chroma
2793
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
2794
0
    {
2795
0
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2796
0
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2797
0
        interMode.distortion += interMode.chromaDistortion;
2798
0
    }
2799
0
    cu.m_distortion[0] = interMode.distortion;
2800
0
    m_entropyCoder.load(m_rqt[depth].cur);
2801
0
    m_entropyCoder.resetBits();
2802
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
2803
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2804
0
    m_entropyCoder.codeSkipFlag(cu, 0);
2805
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
2806
0
    m_entropyCoder.codeMergeIndex(cu, 0);
2807
0
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
2808
0
    interMode.coeffBits = 0;
2809
0
    interMode.totalBits = interMode.mvBits + skipFlagBits;
2810
0
    if (m_rdCost.m_psyRd)
2811
0
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2812
0
    else if(m_rdCost.m_ssimRd)
2813
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
2814
2815
0
    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2816
0
    updateModeCost(interMode);
2817
0
    m_entropyCoder.store(interMode.contexts);
2818
0
}
2819
2820
/* encode residual and calculate rate-distortion for a CU block.
2821
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2822
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
2823
0
{
2824
0
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
2825
2826
0
    CUData& cu = interMode.cu;
2827
0
    Yuv* reconYuv = &interMode.reconYuv;
2828
0
    Yuv* predYuv = &interMode.predYuv;
2829
0
    uint32_t depth = cuGeom.depth;
2830
0
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
2831
0
    const Yuv* fencYuv = interMode.fencYuv;
2832
2833
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2834
2835
0
    uint32_t log2CUSize = cuGeom.log2CUSize;
2836
0
    int sizeIdx = log2CUSize - 2;
2837
2838
0
    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
2839
2840
0
    uint32_t tuDepthRange[2];
2841
0
    cu.getInterTUQtDepthRange(tuDepthRange, 0);
2842
2843
0
    m_entropyCoder.load(m_rqt[depth].cur);
2844
2845
0
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
2846
0
        m_maxTUDepth = -1;
2847
0
    else if (m_limitTU & X265_TU_LIMIT_BFS)
2848
0
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
2849
2850
0
    Cost costs;
2851
0
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
2852
0
    {
2853
        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
2854
0
        int32_t tempDepth = m_maxTUDepth;
2855
0
        if (m_maxTUDepth != -1)
2856
0
        {
2857
0
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
2858
0
            uint32_t minSize = tuDepthRange[0];
2859
0
            uint32_t maxSize = tuDepthRange[1];
2860
0
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
2861
0
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
2862
0
        }
2863
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
2864
0
        m_maxTUDepth = tempDepth;
2865
0
    }
2866
0
    else
2867
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
2868
2869
0
    uint32_t tqBypass = cu.m_tqBypass[0];
2870
0
    if (!tqBypass)
2871
0
    {
2872
0
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2873
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
2874
0
        {
2875
0
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
2876
0
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
2877
0
        }
2878
2879
        /* Consider the RD cost of not signaling any residual */
2880
0
        m_entropyCoder.load(m_rqt[depth].cur);
2881
0
        m_entropyCoder.resetBits();
2882
0
        m_entropyCoder.codeQtRootCbfZero();
2883
0
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
2884
2885
0
        uint32_t cbf0Energy; uint64_t cbf0Cost;
2886
0
        if (m_rdCost.m_psyRd)
2887
0
        {
2888
0
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2889
0
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2890
0
        }
2891
0
        else if(m_rdCost.m_ssimRd)
2892
0
        {
2893
0
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
2894
0
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2895
0
        }
2896
0
        else
2897
0
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
2898
2899
0
        if (cbf0Cost < costs.rdcost)
2900
0
        {
2901
0
            cu.clearCbf();
2902
0
            cu.setTUDepthSubParts(0, 0, depth);
2903
0
        }
2904
0
    }
2905
2906
0
    if (cu.getQtRootCbf(0))
2907
0
        saveResidualQTData(cu, *resiYuv, 0, 0);
2908
2909
    /* calculate signal bits for inter/merge/skip coded CU */
2910
0
    m_entropyCoder.load(m_rqt[depth].cur);
2911
2912
0
    m_entropyCoder.resetBits();
2913
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
2914
0
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
2915
2916
0
    uint32_t coeffBits, bits, mvBits;
2917
0
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
2918
0
    {
2919
0
        cu.setPredModeSubParts(MODE_SKIP);
2920
2921
        /* Merge/Skip */
2922
0
        coeffBits = mvBits = 0;
2923
0
        m_entropyCoder.codeSkipFlag(cu, 0);
2924
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
2925
0
        m_entropyCoder.codeMergeIndex(cu, 0);
2926
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
2927
0
        bits = mvBits + skipFlagBits;
2928
0
    }
2929
0
    else
2930
0
    {
2931
0
        m_entropyCoder.codeSkipFlag(cu, 0);
2932
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
2933
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
2934
0
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
2935
0
        m_entropyCoder.codePredInfo(cu, 0);
2936
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
2937
2938
0
        bool bCodeDQP = m_slice->m_pps->bUseDQP;
2939
0
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
2940
0
        bits = m_entropyCoder.getNumberOfWrittenBits();
2941
2942
0
        coeffBits = bits - mvBits - skipFlagBits;
2943
0
    }
2944
2945
0
    m_entropyCoder.store(interMode.contexts);
2946
2947
0
    if (cu.getQtRootCbf(0))
2948
0
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
2949
0
    else
2950
0
        reconYuv->copyFromYuv(*predYuv);
2951
2952
    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2953
0
    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2954
0
    interMode.distortion = bestLumaDist;
2955
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
2956
0
    {
2957
0
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2958
0
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2959
0
        interMode.chromaDistortion = bestChromaDist;
2960
0
        interMode.distortion += bestChromaDist;
2961
0
    }
2962
0
    if (m_rdCost.m_psyRd)
2963
0
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2964
0
    else if(m_rdCost.m_ssimRd)
2965
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
2966
2967
0
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2968
0
    interMode.totalBits = bits;
2969
0
    interMode.lumaDistortion = bestLumaDist;
2970
0
    interMode.coeffBits = coeffBits;
2971
0
    interMode.mvBits = mvBits;
2972
0
    cu.m_distortion[0] = interMode.distortion;
2973
0
    updateModeCost(interMode);
2974
0
    checkDQP(interMode, cuGeom);
2975
0
}
2976
2977
/* Transform and quantize the inter residual held in m_rqt[cuGeom.depth].tmpResiYuv
 * for the TU at (absPartIdx, tuDepth), without any RD decision.
 *
 * When the TU is larger than depthRange[1], or when a non-2Nx2N CU still has room to
 * split at tuDepth 0, the function recurses into the four quadrants and merges their
 * cbf bits into the parent. Otherwise the TU is coded whole: each plane is
 * transformed/quantized, the residual is replaced by its dequantized version (or
 * zero-filled when no coefficient survives) and the cbf flags are set accordingly. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t depth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    const bool bFitsMaxTU = log2TrSize <= depthRange[1];
    const bool bForceSplit = cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0];

    if (!bFitsMaxTU || bForceSplit)
    {
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");

        /* number of partition units covered by one quadrant */
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
        uint32_t quadPartIdx = absPartIdx;
        for (uint32_t q = 0; q < 4; ++q, quadPartIdx += quadParts)
        {
            residualTransformQuantInter(mode, cuGeom, quadPartIdx, tuDepth + 1, depthRange);
            lumaCbf |= cu.getCbf(quadPartIdx, TEXT_LUMA, tuDepth + 1);
            if (bHasChroma)
            {
                cbCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                crCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            }
        }

        /* a parent is flagged at its own depth when any child carries coefficients */
        cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
        if (bHasChroma)
        {
            cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
            cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
        }
        return;
    }

    // code full block
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t codeChroma = bHasChroma ? 1 : 0;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        /* chroma cannot go below 4x4: code it once, with the first of the four luma blocks */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        tuDepthC--;
        codeChroma &= !(absPartIdx & 3);
    }

    const uint32_t absPartIdxStep = cuGeom.numPartitions >> (tuDepthC * 2);
    const uint32_t setCbf = 1 << tuDepth;

    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* lumaCoeff = cu.m_trCoeff[0] + coeffOffsetY;
    const uint32_t sizeIdx = log2TrSize - 2;

    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
    cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const Yuv* fencYuv = mode.fencYuv;

    /* luma */
    int16_t* lumaResi = resiYuv.getLumaAddr(absPartIdx);
    const uint32_t lumaResiStride = resiYuv.m_size;
    const pixel* lumaFenc = fencYuv->getLumaAddr(absPartIdx);

    const uint32_t numSigY = m_quant.transformNxN(cu, lumaFenc, fencYuv->m_size, lumaResi, lumaResiStride, lumaCoeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
    if (numSigY)
        m_quant.invtransformNxN(cu, lumaResi, lumaResiStride, lumaCoeff, log2TrSize, TEXT_LUMA, false, false, numSigY);
    else
        primitives.cu[sizeIdx].blockfill_s[lumaResiStride % 64 == 0](lumaResi, lumaResiStride, 0);
    cu.setCbfSubParts(numSigY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);

    if (!codeChroma)
        return;

    /* chroma */
    const uint32_t sizeIdxC = log2TrSizeC - 2;
    const uint32_t chromaResiStride = resiYuv.m_csize;

    const uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
    coeff_t* cbCoeff = cu.m_trCoeff[1] + coeffOffsetC;
    coeff_t* crCoeff = cu.m_trCoeff[2] + coeffOffsetC;
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);

    TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
        const uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

        cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
        cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);

        /* Cb */
        int16_t* cbResi = resiYuv.getCbAddr(absPartIdxC);
        const pixel* cbFenc = fencYuv->getCbAddr(absPartIdxC);
        const uint32_t numSigU = m_quant.transformNxN(cu, cbFenc, fencYuv->m_csize, cbResi, chromaResiStride, cbCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
        if (numSigU)
            m_quant.invtransformNxN(cu, cbResi, chromaResiStride, cbCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
        else
            primitives.cu[sizeIdxC].blockfill_s[chromaResiStride % 64 == 0](cbResi, chromaResiStride, 0);
        cu.setCbfPartRange(numSigU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);

        /* Cr */
        int16_t* crResi = resiYuv.getCrAddr(absPartIdxC);
        const pixel* crFenc = fencYuv->getCrAddr(absPartIdxC);
        const uint32_t numSigV = m_quant.transformNxN(cu, crFenc, fencYuv->m_csize, crResi, chromaResiStride, crCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
        if (numSigV)
            m_quant.invtransformNxN(cu, crResi, chromaResiStride, crCoeff + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
        else
            primitives.cu[sizeIdxC].blockfill_s[chromaResiStride % 64 == 0](crResi, chromaResiStride, 0);
        cu.setCbfPartRange(numSigV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
    }
    while (tuIterator.isNextSection());

    if (splitIntoSubTUs)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
3113
3114
/* RD cost of signalling a zero cbf for component compId at tuDepth, given the
 * distortion (and, for psy-rd / ssim-rd, the energy) that results from sending no
 * coefficients. psy-rd takes precedence over ssim-rd when both are enabled. */
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
{
    const uint32_t zeroCbfBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);

    if (m_rdCost.m_psyRd)
        return m_rdCost.calcPsyRdCost(dist, zeroCbfBits, energy);

    if (m_rdCost.m_ssimRd)
        return m_rdCost.calcSsimRdCost(dist, zeroCbfBits, energy);

    return m_rdCost.calcRdCost(dist, zeroCbfBits);
}
3125
3126
/* Evaluate the four quadrants of the TU at (absPartIdx, tuDepth) through
 * estimateResidualQT(), merge their cbf bits into the parent, add the bits of the
 * subdivision/cbf signalling to splitCost and recompute splitCost.rdcost.
 * Returns true when any quadrant has a non-zero cbf in any plane. */
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
{
    CUData& cu = mode.cu;
    const uint32_t depth = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    const bool bLimitFromFirstQuad = (m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0;

    /* number of partition units covered by one quadrant */
    const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
    uint32_t quadPartIdx = absPartIdx;
    for (uint32_t q = 0; q < 4; ++q, quadPartIdx += quadParts)
    {
        if (bLimitFromFirstQuad && q == 1)
        {
            // Fetch maximum TU depth of first sub partition to limit recursion of others
            m_maxTUDepth = cu.m_tuDepth[0];
            for (uint32_t part = 1; part < cuGeom.numPartitions / 4; part++)
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[part]);
        }

        estimateResidualQT(mode, cuGeom, quadPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);

        lumaCbf |= cu.getCbf(quadPartIdx, TEXT_LUMA, tuDepth + 1);
        if (bHasChroma)
        {
            cbCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            crCbf |= cu.getCbf(quadPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
    }

    /* a parent is flagged at its own depth when any child carries coefficients */
    cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
    if (bHasChroma)
    {
        cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
        cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
    }

    // The coefficient bits of the individual sub-blocks have already been collected, so
    // only the subdivision/cbf flags are coded here (chroma cbfs are coded differently
    // from luma cbfs).
    // Open question kept from the original author: the coefficients may have been coded
    // with contexts from a deeper level (e.g. depth 2) while the cbfs are coded with the
    // contexts loaded at this level (e.g. depth 0).
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
    m_entropyCoder.resetBits();
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
    const uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
    splitCost.bits += splitCbfBits;

    if (m_rdCost.m_psyRd)
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else if (m_rdCost.m_ssimRd)
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

    return (lumaCbf | cbCbf | crCbf) != 0;
}
3177
3178
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
3179
0
{
3180
0
    CUData& cu = mode.cu;
3181
0
    uint32_t depth = cuGeom.depth + tuDepth;
3182
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
3183
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
3184
3185
0
    bool bCheckSplit = log2TrSize > depthRange[0];
3186
0
    bool bCheckFull = log2TrSize <= depthRange[1];
3187
0
    bool bSaveTUData = false, bLoadTUData = false;
3188
0
    uint32_t idx = 0;
3189
3190
0
    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
3191
0
    {
3192
0
        if (bCheckSplit && bCheckFull && tuDepth)
3193
0
        {
3194
0
            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
3195
0
            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
3196
0
            idx = (depth - 1) * 4 + qIdx;
3197
0
            if (splitMore)
3198
0
            {
3199
0
                bLoadTUData = true;
3200
0
                bCheckFull = false;
3201
0
            }
3202
0
            else
3203
0
            {
3204
0
                bSaveTUData = true;
3205
0
                bCheckSplit = false;
3206
0
            }
3207
0
        }
3208
0
    }
3209
0
    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
3210
0
    {
3211
0
        if (bCheckSplit && m_maxTUDepth >= 0)
3212
0
        {
3213
0
            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
3214
0
            bCheckSplit = log2TrSize > log2MaxTrSize;
3215
0
        }
3216
0
    }
3217
3218
0
    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
3219
3220
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
3221
0
        bCheckFull = false;
3222
3223
0
    X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
3224
3225
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
3226
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
3227
0
    uint32_t tuDepthC = tuDepth;
3228
0
    if (log2TrSizeC < 2)
3229
0
    {
3230
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
3231
0
        log2TrSizeC = 2;
3232
0
        tuDepthC--;
3233
0
        codeChroma &= !(absPartIdx & 3);
3234
0
    }
3235
3236
    // code full block
3237
0
    Cost fullCost;
3238
0
    fullCost.rdcost = MAX_INT64;
3239
3240
0
    uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
3241
0
    uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
3242
0
    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
3243
0
    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
3244
0
    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
3245
0
    uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
3246
0
    uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
3247
3248
0
    m_entropyCoder.store(m_rqt[depth].rqtRoot);
3249
3250
0
    uint32_t trSize = 1 << log2TrSize;
3251
0
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
3252
0
    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
3253
0
    const Yuv* fencYuv = mode.fencYuv;
3254
3255
    // code full block
3256
0
    if (bCheckFull)
3257
0
    {
3258
0
        uint32_t trSizeC = 1 << log2TrSizeC;
3259
0
        int partSize = partitionFromLog2Size(log2TrSize);
3260
0
        int partSizeC = partitionFromLog2Size(log2TrSizeC);
3261
0
        const uint32_t qtLayer = log2TrSize - 2;
3262
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
3263
0
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
3264
3265
0
        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
3266
0
        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
3267
0
        bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
3268
3269
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
3270
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
3271
3272
0
        if (bEnableRDOQ)
3273
0
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
3274
3275
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
3276
0
        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
3277
0
        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
3278
0
        cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
3279
3280
0
        m_entropyCoder.resetBits();
3281
3282
0
        if (bSplitPresentFlag && log2TrSize > depthRange[0])
3283
0
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
3284
3285
0
        if (cbfFlag[TEXT_LUMA][0])
3286
0
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
3287
0
        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
3288
3289
0
        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
3290
3291
        //Assuming zero residual 
3292
0
        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
3293
0
        uint32_t zeroEnergyY = 0;
3294
0
        if (m_rdCost.m_psyRd)
3295
0
            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
3296
0
        else if(m_rdCost.m_ssimRd)
3297
0
            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
3298
3299
0
        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
3300
0
        uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
3301
3302
0
        if (cbfFlag[TEXT_LUMA][0])
3303
0
        {
3304
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
3305
3306
            // non-zero cost calculation for luma - This is an approximation
3307
            // finally we have to encode correct cbf after comparing with null cost
3308
0
            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
3309
0
            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
3310
0
            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
3311
0
            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
3312
0
            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
3313
0
            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
3314
0
            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
3315
3316
0
            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
3317
0
            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
3318
0
            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
3319
0
            if (m_rdCost.m_psyRd)
3320
0
            {
3321
0
                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
3322
0
                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
3323
0
            }
3324
0
            else if(m_rdCost.m_ssimRd)
3325
0
            {
3326
0
                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
3327
0
                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
3328
0
            }
3329
0
            else
3330
0
                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
3331
3332
0
            if (cu.m_tqBypass[0])
3333
0
            {
3334
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
3335
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
3336
0
            }
3337
0
            else
3338
0
            {
3339
                // zero-cost calculation for luma. This is an approximation
3340
                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
3341
                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
3342
0
                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
3343
3344
0
                if (nullCostY < singleCostY)
3345
0
                {
3346
0
                    cbfFlag[TEXT_LUMA][0] = 0;
3347
0
                    singleBits[TEXT_LUMA][0] = 0;
3348
0
                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
3349
#if CHECKED_BUILD || _DEBUG
3350
                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
3351
                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
3352
#endif
3353
0
                    if (checkTransformSkipY)
3354
0
                        minCost[TEXT_LUMA][0] = nullCostY;
3355
0
                    singleDist[TEXT_LUMA][0] = zeroDistY;
3356
0
                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
3357
0
                }
3358
0
                else
3359
0
                {
3360
0
                    if (checkTransformSkipY)
3361
0
                        minCost[TEXT_LUMA][0] = singleCostY;
3362
0
                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
3363
0
                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
3364
0
                }
3365
0
            }
3366
0
        }
3367
0
        else
3368
0
        {
3369
0
            if (checkTransformSkipY)
3370
0
                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
3371
0
            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
3372
0
            singleDist[TEXT_LUMA][0] = zeroDistY;
3373
0
            singleBits[TEXT_LUMA][0] = 0;
3374
0
            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
3375
0
        }
3376
3377
0
        cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
3378
3379
0
        if (codeChroma)
3380
0
        {
3381
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3382
0
            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
3383
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
3384
0
            {
3385
0
                sse_t zeroDistC = 0;
3386
0
                uint32_t zeroEnergyC = 0;
3387
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
3388
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
3389
3390
0
                do
3391
0
                {
3392
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
3393
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
3394
3395
0
                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3396
3397
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
3398
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
3399
3400
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
3401
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
3402
0
                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
3403
0
                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
3404
3405
0
                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
3406
0
                    if (cbfFlag[chromaId][tuIterator.section])
3407
0
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
3408
3409
0
                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
3410
3411
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
3412
0
                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
3413
3414
                    // Assuming zero residual 
3415
0
                    if (m_rdCost.m_psyRd)
3416
0
                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
3417
0
                    else if(m_rdCost.m_ssimRd)
3418
0
                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
3419
3420
0
                    if (cbfFlag[chromaId][tuIterator.section])
3421
0
                    {
3422
0
                        m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
3423
0
                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
3424
3425
                        // non-zero cost calculation for chroma, same as luma - This is an approximation
3426
                        // finally we have to encode correct cbf after comparing with null cost
3427
0
                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
3428
0
                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
3429
0
                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
3430
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
3431
0
                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
3432
0
                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
3433
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
3434
0
                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
3435
0
                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
3436
0
                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
3437
0
                        if (m_rdCost.m_psyRd)
3438
0
                        {
3439
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
3440
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
3441
0
                        }
3442
0
                        else if(m_rdCost.m_ssimRd)
3443
0
                        {
3444
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
3445
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
3446
0
                        }
3447
0
                        else
3448
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
3449
3450
0
                        if (cu.m_tqBypass[0])
3451
0
                        {
3452
0
                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
3453
0
                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
3454
0
                        }
3455
0
                        else
3456
0
                        {
3457
                            //zero-cost calculation for chroma. This is an approximation
3458
0
                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
3459
3460
0
                            if (nullCostC < singleCostC)
3461
0
                            {
3462
0
                                cbfFlag[chromaId][tuIterator.section] = 0;
3463
0
                                singleBits[chromaId][tuIterator.section] = 0;
3464
0
                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
3465
#if CHECKED_BUILD || _DEBUG
3466
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
3467
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
3468
#endif
3469
0
                                if (checkTransformSkipC)
3470
0
                                    minCost[chromaId][tuIterator.section] = nullCostC;
3471
0
                                singleDist[chromaId][tuIterator.section] = zeroDistC;
3472
0
                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
3473
0
                            }
3474
0
                            else
3475
0
                            {
3476
0
                                if (checkTransformSkipC)
3477
0
                                    minCost[chromaId][tuIterator.section] = singleCostC;
3478
0
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
3479
0
                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
3480
0
                            }
3481
0
                        }
3482
0
                    }
3483
0
                    else
3484
0
                    {
3485
0
                        if (checkTransformSkipC)
3486
0
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
3487
0
                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
3488
0
                        singleBits[chromaId][tuIterator.section] = 0;
3489
0
                        singleDist[chromaId][tuIterator.section] = zeroDistC;
3490
0
                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
3491
0
                    }
3492
3493
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3494
0
                }
3495
0
                while (tuIterator.isNextSection());
3496
0
            }
3497
0
        }
3498
3499
0
        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
3500
0
        {
3501
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
3502
0
            {
3503
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
3504
0
                do
3505
0
                {
3506
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
3507
0
                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3508
0
                }
3509
0
                while(tuIterator.isNextSection());
3510
0
            }
3511
0
        }
3512
0
        if (checkTransformSkipY)
3513
0
        {
3514
0
            sse_t nonZeroDistY = 0;
3515
0
            uint32_t nonZeroEnergyY = 0;
3516
0
            uint64_t singleCostY = MAX_INT64;
3517
3518
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
3519
3520
0
            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
3521
3522
0
            if (bEnableRDOQ)
3523
0
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
3524
3525
0
            fenc = fencYuv->getLumaAddr(absPartIdx);
3526
0
            resi = resiYuv.getLumaAddr(absPartIdx);
3527
0
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
3528
3529
0
            if (numSigTSkipY)
3530
0
            {
3531
0
                m_entropyCoder.resetBits();
3532
0
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
3533
0
                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
3534
0
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
3535
3536
0
                m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
3537
0
                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
3538
3539
0
                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
3540
0
                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
3541
0
                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
3542
3543
0
                if (m_rdCost.m_psyRd)
3544
0
                {
3545
0
                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
3546
0
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
3547
0
                }
3548
0
                else if(m_rdCost.m_ssimRd)
3549
0
                {
3550
0
                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
3551
0
                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
3552
0
                }
3553
0
                else
3554
0
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
3555
0
            }
3556
3557
0
            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
3558
0
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
3559
0
            else
3560
0
            {
3561
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
3562
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
3563
0
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
3564
0
                bestTransformMode[TEXT_LUMA][0] = 1;
3565
0
                if (m_param->limitTU)
3566
0
                    numSig[TEXT_LUMA][0] = numSigTSkipY;
3567
0
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
3568
0
                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
3569
0
                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
3570
0
            }
3571
3572
0
            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
3573
0
        }
3574
3575
0
        if (codeChroma && checkTransformSkipC)
3576
0
        {
3577
0
            sse_t nonZeroDistC = 0;
3578
0
            uint32_t nonZeroEnergyC = 0;
3579
0
            uint64_t singleCostC = MAX_INT64;
3580
0
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
3581
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3582
3583
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
3584
3585
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
3586
0
            {
3587
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
3588
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
3589
3590
0
                do
3591
0
                {
3592
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
3593
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
3594
3595
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
3596
3597
0
                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3598
3599
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
3600
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
3601
3602
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
3603
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
3604
0
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
3605
3606
0
                    m_entropyCoder.resetBits();
3607
0
                    singleBits[chromaId][tuIterator.section] = 0;
3608
3609
0
                    if (numSigTSkipC)
3610
0
                    {
3611
0
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
3612
0
                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
3613
0
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
3614
3615
0
                        m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
3616
0
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
3617
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
3618
0
                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
3619
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
3620
0
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
3621
0
                        if (m_rdCost.m_psyRd)
3622
0
                        {
3623
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
3624
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
3625
0
                        }
3626
0
                        else if(m_rdCost.m_ssimRd)
3627
0
                        {
3628
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
3629
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
3630
0
                        }
3631
0
                        else
3632
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
3633
0
                    }
3634
3635
0
                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
3636
0
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3637
0
                    else
3638
0
                    {
3639
0
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
3640
0
                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
3641
0
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
3642
0
                        bestTransformMode[chromaId][tuIterator.section] = 1;
3643
0
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
3644
0
                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
3645
0
                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
3646
0
                    }
3647
3648
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
3649
0
                }
3650
0
                while (tuIterator.isNextSection());
3651
0
            }
3652
0
        }
3653
3654
        // Here we were encoding cbfs and coefficients, after calculating distortion above.
3655
        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
3656
        // bits required for coefficients and added with number of cbf bits. As I tested the order does not
3657
        // make any difference. But bit confused whether I should load the original context as below.
3658
0
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
3659
0
        m_entropyCoder.resetBits();
3660
3661
        //Encode cbf flags
3662
0
        if (codeChroma)
3663
0
        {
3664
0
            if (!splitIntoSubTUs)
3665
0
            {
3666
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
3667
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
3668
0
            }
3669
0
            else
3670
0
            {
3671
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
3672
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
3673
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
3674
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
3675
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
3676
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
3677
0
            }
3678
0
        }
3679
3680
0
        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
3681
3682
0
        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
3683
3684
0
        uint32_t coeffBits = 0;
3685
0
        coeffBits = singleBits[TEXT_LUMA][0];
3686
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
3687
0
        {
3688
0
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
3689
0
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
3690
0
        }
3691
3692
        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
3693
        // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
3694
        // four split block's individual cbf value. This is not known before analysis of four split blocks.
3695
        // For that reason, I am collecting individual coefficient bits only.
3696
0
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
3697
3698
0
        fullCost.distortion += singleDist[TEXT_LUMA][0];
3699
0
        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
3700
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
3701
0
        {
3702
0
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
3703
0
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
3704
0
        }
3705
3706
0
        if (m_rdCost.m_psyRd)
3707
0
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
3708
0
        else if(m_rdCost.m_ssimRd)
3709
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
3710
0
        else
3711
0
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
3712
3713
0
        if (m_param->limitTU && bCheckSplit)
3714
0
        {
3715
            // Stop recursion if the TU's energy level is minimal
3716
0
            uint32_t numCoeff = trSize * trSize;
3717
0
            if (cbfFlag[TEXT_LUMA][0] == 0)
3718
0
                bCheckSplit = false;
3719
0
            else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
3720
0
            {
3721
0
                uint32_t energy = 0;
3722
0
                for (uint32_t i = 0; i < numCoeff; i++)
3723
0
                    energy += abs(coeffCurY[i]);
3724
0
                if (energy == numSig[TEXT_LUMA][0])
3725
0
                    bCheckSplit = false;
3726
0
            }
3727
0
        }
3728
3729
0
        if (bSaveTUData)
3730
0
        {
3731
0
            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
3732
0
            {
3733
0
                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
3734
0
                {
3735
0
                    m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
3736
0
                    m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
3737
0
                }
3738
0
            }
3739
0
            m_cacheTU.cost[idx] = fullCost;
3740
0
            m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
3741
0
        }
3742
0
    }
3743
0
    if (bLoadTUData)
3744
0
    {
3745
0
        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
3746
0
        {
3747
0
            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
3748
0
            {
3749
0
                bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
3750
0
                cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
3751
0
            }
3752
0
        }
3753
0
        fullCost = m_cacheTU.cost[idx];
3754
0
        m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
3755
0
        bCheckFull = true;
3756
0
    }
3757
3758
    // code sub-blocks
3759
0
    if (bCheckSplit)
3760
0
    {
3761
0
        if (bCheckFull)
3762
0
        {
3763
0
            m_entropyCoder.store(m_rqt[depth].rqtTest);
3764
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
3765
0
        }
3766
3767
0
        Cost splitCost;
3768
0
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
3769
0
        {
3770
            // Subdiv flag can be encoded at the start of analysis of split blocks.
3771
0
            m_entropyCoder.resetBits();
3772
0
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
3773
0
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
3774
0
        }
3775
3776
0
        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
3777
0
        if (yCbCrCbf || !bCheckFull)
3778
0
        {
3779
0
            if (splitCost.rdcost < fullCost.rdcost)
3780
0
            {
3781
0
                if (m_limitTU & X265_TU_LIMIT_BFS)
3782
0
                {
3783
0
                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
3784
0
                    bool nextSplit = nextlog2TrSize > depthRange[0];
3785
0
                    if (nextSplit)
3786
0
                    {
3787
0
                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
3788
0
                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
3789
0
                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
3790
0
                        {
3791
                            // Subdiv flag can be encoded at the start of analysis of split blocks.
3792
0
                            m_entropyCoder.resetBits();
3793
0
                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
3794
0
                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
3795
0
                        }
3796
0
                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
3797
0
                    }
3798
0
                }
3799
0
                outCosts.distortion += splitCost.distortion;
3800
0
                outCosts.rdcost     += splitCost.rdcost;
3801
0
                outCosts.bits       += splitCost.bits;
3802
0
                outCosts.energy     += splitCost.energy;
3803
0
                return;
3804
0
            }
3805
0
            else
3806
0
                outCosts.energy     += splitCost.energy;
3807
0
        }
3808
3809
0
        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
3810
0
        if (codeChroma)
3811
0
        {
3812
0
            if (!splitIntoSubTUs)
3813
0
            {
3814
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
3815
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
3816
0
            }
3817
0
            else
3818
0
            {
3819
0
                uint32_t tuNumParts = absPartIdxStep >> 1;
3820
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
3821
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
3822
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
3823
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
3824
0
            }
3825
0
        }
3826
0
        X265_CHECK(bCheckFull, "check-full must be set\n");
3827
0
        m_entropyCoder.load(m_rqt[depth].rqtTest);
3828
0
    }
3829
3830
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
3831
0
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
3832
3833
0
    if (codeChroma)
3834
0
    {
3835
0
        if (!splitIntoSubTUs)
3836
0
        {
3837
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
3838
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
3839
0
        }
3840
0
        else
3841
0
        {
3842
0
            uint32_t tuNumParts = absPartIdxStep >> 1;
3843
3844
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
3845
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
3846
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
3847
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
3848
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
3849
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
3850
0
        }
3851
0
    }
3852
3853
0
    outCosts.distortion += fullCost.distortion;
3854
0
    outCosts.rdcost     += fullCost.rdcost;
3855
0
    outCosts.bits       += fullCost.bits;
3856
0
    outCosts.energy     += fullCost.energy;
3857
0
}
3858
3859
/* Walks the transform quadtree of an inter CU and entropy-codes its coded
 * block flags (CBFs).
 *
 * At every visited node the chroma U and V CBFs are coded (when chroma is
 * present and the chroma block is not too small); a plane is only coded at the
 * root or when the parent node's CBF for that plane is set. Leaf nodes
 * additionally code the luma CBF, non-leaf nodes recurse into their four
 * children.
 *
 *   cu          CU whose CBFs are signalled; must be inter at absPartIdx
 *   absPartIdx  partition index of the current quadtree node
 *   tuDepth     depth of the current node within the transform quadtree
 *   depthRange  not read here, only forwarded to the recursive calls */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool isLeaf = tuDepth >= cu.m_tuDepth[absPartIdx];
    const bool hasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;

    /* chroma CBFs are skipped entirely once (log2TrSize - m_hChromaShift) drops below 2 */
    if (hasChroma && log2TrSize - m_hChromaShift >= 2)
    {
        /* mask absPartIdx down to the partition index of the parent node */
        const uint32_t parentIdx = absPartIdx & (0xFF << ((log2TrSize + 1 - LOG2_UNIT_SIZE) * 2));

        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, isLeaf);

        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, isLeaf);
    }

    if (isLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
        return;
    }

    /* not a leaf: visit the four child nodes, each spanning qNumParts partitions */
    const uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    uint32_t childPartIdx = absPartIdx;
    for (uint32_t child = 0; child < 4; ++child)
    {
        codeInterSubdivCbfQT(cu, childPartIdx, tuDepth + 1, depthRange);
        childPartIdx += qNumParts;
    }
}
3888
3889
/* Recursively visits the transform quadtree of `cu` and, for every leaf TU,
 * moves the data held in the per-layer RQT scratch storage (m_rqt[qtLayer])
 * to its final location:
 *   - the residual is transferred with copyPartToPartLuma/Chroma between
 *     m_rqt[qtLayer].resiQtYuv and resiYuv (presumably into resiYuv, mirroring
 *     the coefficient copy below -- confirm against ShortYuv)
 *   - the transform coefficients are copied from m_rqt[qtLayer].coeffRQT[]
 *     into cu.m_trCoeff[]
 *
 *   cu          CU whose TU tree is walked and whose m_trCoeff is written
 *   resiYuv     residual buffer paired with the scratch residual
 *   absPartIdx  partition index of the current quadtree node
 *   tuDepth     depth of the current node within the transform quadtree */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* not a leaf: descend into the four child nodes */
        uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
        return;
    }

    /* scratch layers are indexed by log2TrSize - 2 */
    const uint32_t qtLayer = log2TrSize - 2;

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    bool codeChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    if (log2TrSizeC < 2)
    {
        /* chroma block would be too small: clamp its log2 size to 2 and only
         * handle chroma for partitions whose low two index bits are zero */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        codeChroma = codeChroma && !(absPartIdx & 3);
    }

    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    uint32_t numCoeffY = 1 << (log2TrSize * 2);
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);

    if (codeChroma)
    {
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

        /* for 4:2:2 the coefficient count per chroma plane is doubled */
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
    }
}
3937
3938
/* returns the number of bits required to signal a non-most-probable mode.
3939
 * on return mpms contains bitmap of most probable modes */
3940
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
3941
1.78M
{
3942
1.78M
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
3943
3944
1.78M
    mpms = 0;
3945
7.15M
    for (int i = 0; i < 3; ++i)
3946
5.36M
        mpms |= ((uint64_t)1 << mpmModes[i]);
3947
3948
1.78M
    return m_entropyCoder.bitsIntraModeNonMPM();
3949
1.78M
}
3950
3951
/* swap the current mode/cost with the mode with the highest cost in the
3952
 * current candidate list, if its cost is better (maintain a top N list) */
3953
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
3954
1.88M
{
3955
1.88M
    uint32_t maxIndex = 0;
3956
1.88M
    uint64_t maxValue = 0;
3957
3958
16.2M
    for (int i = 0; i < maxCandCount; i++)
3959
14.3M
    {
3960
14.3M
        if (maxValue < candCostList[i])
3961
2.03M
        {
3962
2.03M
            maxValue = candCostList[i];
3963
2.03M
            maxIndex = i;
3964
2.03M
        }
3965
14.3M
    }
3966
3967
1.88M
    if (cost < maxValue)
3968
1.81M
    {
3969
1.81M
        candCostList[maxIndex] = cost;
3970
1.81M
        candModeList[maxIndex] = mode;
3971
1.81M
    }
3972
1.88M
}
3973
3974
/* Accounts for delta-QP signalling of a CU.
 *
 * Does nothing unless the PPS enables DQP and cuGeom.depth is not deeper than
 * maxCuDQPDepth. When the CU has a non-zero root CBF the mode's cost is
 * increased by the delta-QP bits (measured with the entropy coder at
 * rdLevel >= 3, approximated as one bit otherwise); when it has no residual
 * the CU's QP is reset to the reference QP instead. */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!cu.m_slice->m_pps->bUseDQP || cuGeom.depth > cu.m_slice->m_pps->maxCuDQPDepth)
        return;

    if (!cu.getQtRootCbf(0))
    {
        /* no residual: fall back to the reference QP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D based decisions: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel == 2)
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
    else
    {
        /* rdLevel >= 3: measure the real cost of coding the delta QP */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
}
4004
4005
/* Delta-QP accounting for a split prediction mode.
 *
 * Only acts when the PPS enables DQP and cuGeom.depth equals maxCuDQPDepth.
 * If any partition of the CU has a non-zero root CBF, the mode's cost is
 * increased by the delta-QP bits (measured with the entropy coder at
 * rdLevel >= 3, approximated as one bit otherwise) and setQPSubCUs() is
 * invoked with the reference QP; otherwise the whole CU is reset to the
 * reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!(cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth && cu.m_slice->m_pps->bUseDQP))
        return;

    /* Scan for the first partition with a non-zero root CBF (i.e. residual) */
    uint32_t blkIdx = 0;
    while (blkIdx < cuGeom.numPartitions && !cu.getQtRootCbf(blkIdx))
        blkIdx++;
    const bool hasResidual = blkIdx < cuGeom.numPartitions;

    if (!hasResidual)
    {
        /* No residual within this CU or subCU, so reset QP to RefQP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        /* SA8D based decisions: charge a single bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel == 2)
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
    else
    {
        /* rdLevel >= 3: measure the real cost of coding the delta QP */
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }

    /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
     * When the non-zero CBF sub-CU is found, stop */
    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}