Coverage Report

Created: 2025-07-23 08:18

/src/x265/source/encoder/analysis.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
*          Steve Borho <steve@borho.org>
6
*          Min Chen <chenm003@163.com>
7
*
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
* the Free Software Foundation; either version 2 of the License, or
11
* (at your option) any later version.
12
*
13
* This program is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
* GNU General Public License for more details.
17
*
18
* You should have received a copy of the GNU General Public License
19
* along with this program; if not, write to the Free Software
20
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
*
22
* This program is also available under a commercial proprietary license.
23
* For more information, contact us at license @ x265.com.
24
*****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "picyuv.h"
30
#include "primitives.h"
31
#include "threading.h"
32
33
#include "analysis.h"
34
#include "rdcost.h"
35
#include "encoder.h"
36
37
using namespace X265_NS;
38
39
/* An explanation of rate distortion levels (--rd-level)
40
 *
41
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
42
 *
43
 *   sa8d selection between merge / skip / inter / intra and split
44
 *   no recon pixels generated until CTU analysis is complete, requiring
45
 *   intra predictions to use source pixels
46
 *
47
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
48
 *
49
 *   RDO selection between merge and skip
50
 *   sa8d selection between (merge/skip) / inter modes / intra and split
51
 *   intra prediction uses reconstructed pixels
52
 *
53
 * rd-level 2 uses RDO for merge/skip and split
54
 *
55
 *   RDO selection between merge and skip
56
 *   sa8d selection between (merge/skip) / inter modes / intra
57
 *   RDO split decisions
58
 *
59
 * rd-level 3 uses RDO for merge/skip/best inter/intra
60
 *
61
 *   RDO selection between merge and skip
62
 *   sa8d selection of best inter mode
63
 *   sa8d decisions include chroma residual cost
64
 *   RDO selection between (merge/skip) / best inter mode / intra / split
65
 *
66
 * rd-level 4 enables RDOQuant
67
 *   chroma residual cost included in satd decisions, including subpel refine
68
 *    (as a result of --subme 3 being used by preset slow)
69
 *
70
 * rd-level 5,6 does RDO for each inter mode
71
 */
72
73
Analysis::Analysis()
74
0
{
75
0
    m_reuseInterDataCTU = NULL;
76
0
    m_reuseRef = NULL;
77
0
    m_bHD = false;
78
0
    m_modeFlag[0] = false;
79
0
    m_modeFlag[1] = false;
80
0
    m_checkMergeAndSkipOnly[0] = false;
81
0
    m_checkMergeAndSkipOnly[1] = false;
82
0
    m_evaluateInter = 0;
83
0
}
84
85
bool Analysis::create(ThreadLocalData *tld)
86
0
{
87
0
    m_tld = tld;
88
0
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
89
90
0
    int costArrSize = 1;
91
0
    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
92
0
    for (uint32_t i = 1; i <= maxDQPDepth; i++)
93
0
        costArrSize += (1 << (i * 2));
94
0
    cacheCost = X265_MALLOC(uint64_t, costArrSize);
95
96
0
    int csp = m_param->internalCsp;
97
0
    uint32_t cuSize = m_param->maxCUSize;
98
99
0
    bool ok = true;
100
0
    for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
101
0
    {
102
0
        ModeDepth &md = m_modeDepth[depth];
103
0
        ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
104
0
        ok &= md.fencYuv.create(cuSize, csp);
105
0
        if (ok)
106
0
        {
107
0
            for (int j = 0; j < MAX_PRED_TYPES; j++)
108
0
            {
109
0
                md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
110
0
                ok &= md.pred[j].predYuv.create(cuSize, csp);
111
0
                ok &= md.pred[j].reconYuv.create(cuSize, csp);
112
0
                md.pred[j].fencYuv = &md.fencYuv;
113
0
            }
114
0
        }
115
0
    }
116
0
    if (m_param->sourceHeight >= 1080)
117
0
        m_bHD = true;
118
119
0
    return ok;
120
0
}
121
122
void Analysis::destroy()
123
0
{
124
0
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
125
0
    {
126
0
        m_modeDepth[i].cuMemPool.destroy();
127
0
        m_modeDepth[i].fencYuv.destroy();
128
129
0
        for (int j = 0; j < MAX_PRED_TYPES; j++)
130
0
        {
131
0
            m_modeDepth[i].pred[j].predYuv.destroy();
132
0
            m_modeDepth[i].pred[j].reconYuv.destroy();
133
0
        }
134
0
    }
135
0
    X265_FREE(cacheCost);
136
0
}
137
138
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
139
0
{
140
0
    m_slice = ctu.m_slice;
141
0
    m_frame = &frame;
142
0
    m_bChromaSa8d = m_param->rdLevel >= 3;
143
0
    m_param = m_frame->m_param;
144
145
#if _DEBUG || CHECKED_BUILD
146
    invalidateContexts(0);
147
#endif
148
149
0
    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
150
0
    ctu.setQPSubParts((int8_t)qp, 0, 0);
151
152
0
    m_rqt[0].cur.load(initialContext);
153
0
    ctu.m_meanQP = initialContext.m_meanQP;
154
0
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
155
156
0
    if (m_param->bSsimRd)
157
0
        calculateNormFactor(ctu, qp);
158
159
0
    uint32_t numPartition = ctu.m_numPartitions;
160
0
    if (m_param->bCTUInfo && m_frame->m_ctuInfo && m_frame->m_ctuInfo[ctu.m_cuAddr])
161
0
    {
162
0
        x265_ctu_info_t* ctuTemp = m_frame->m_ctuInfo[ctu.m_cuAddr];
163
0
        int32_t depthIdx = 0;
164
0
        uint32_t maxNum8x8Partitions = 64;
165
0
        uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
166
0
        uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
167
0
        int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
168
0
        do
169
0
        {
170
0
            uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
171
0
            uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
172
0
            int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
173
0
            memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
174
0
            memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
175
0
            memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
176
0
            for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
177
0
                prevCtuInfoChangePtr[l] = prevCtuInfoChange;
178
0
            depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
179
0
            contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
180
0
            prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
181
0
            depthIdx++;
182
0
        } while (ctuTemp->ctuPartitions[depthIdx] != 0);
183
184
0
        m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
185
0
        m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
186
0
        memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
187
        //Calculate log2CUSize from depth
188
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
189
0
            ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
190
0
    }
191
0
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
192
0
    {
193
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
194
0
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
195
0
        for (int dir = 0; dir < numPredDir; dir++)
196
0
        {
197
0
            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
198
0
            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
199
0
        }
200
0
        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
201
0
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
202
0
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
203
0
    }
204
    
205
0
    int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
206
0
    if ((strlen(m_param->analysisSave) || strlen(m_param->analysisLoad)) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
207
0
    {
208
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
209
0
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
210
0
        if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
211
0
            ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
212
0
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
213
0
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
214
0
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
215
0
        if (reuseLevel > 4)
216
0
        {
217
0
            m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
218
0
            m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
219
0
        }
220
0
        if (strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
221
0
            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
222
0
                m_reuseRef[i] = -1;
223
0
    }
224
0
    ProfileCUScope(ctu, totalCTUTime, totalCTUs);
225
226
#if  ENABLE_SCC_EXT
227
    memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
228
    memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
229
    m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0;
230
#endif
231
0
    if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
232
0
    {
233
0
        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
234
0
        if (m_param->analysisLoadReuseLevel > 1)
235
0
        {
236
0
            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
237
0
            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
238
0
            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
239
0
            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
240
0
        }
241
#if ENABLE_SCC_EXT
242
        compressIntraCU(ctu, cuGeom, qp, &m_ibc);
243
#else
244
0
        compressIntraCU(ctu, cuGeom, qp);
245
0
#endif
246
0
    }
247
0
    else
248
0
    {
249
0
        bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
250
0
        bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
251
0
        bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
252
0
        bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;
253
254
0
        if (bCopyAnalysis)
255
0
        {
256
0
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
257
0
            int posCTU = ctu.m_cuAddr * numPartition;
258
0
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
259
0
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
260
0
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
261
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
262
0
                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);
263
264
0
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
265
0
            {
266
0
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
267
0
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
268
0
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
269
0
            }
270
            //Calculate log2CUSize from depth
271
0
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
272
0
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
273
0
        }
274
275
0
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
276
0
            ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
277
0
            && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
278
0
            compressIntraCU(ctu, cuGeom, qp);
279
0
        else if (!m_param->rdLevel)
280
0
        {
281
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
282
             * they are available for intra predictions */
283
0
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);
284
285
0
            compressInterCU_rd0_4(ctu, cuGeom, qp);
286
287
            /* generate residual for entire CTU at once and copy to reconPic */
288
0
            encodeResidue(ctu, cuGeom);
289
0
        }
290
0
        else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
291
0
                ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
292
0
        {
293
0
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
294
0
            int posCTU = ctu.m_cuAddr * numPartition;
295
0
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
296
0
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
297
0
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
298
0
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
299
0
            {
300
0
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
301
0
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
302
0
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
303
0
            }
304
            //Calculate log2CUSize from depth
305
0
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
306
0
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
307
308
0
            qprdRefine (ctu, cuGeom, qp, qp);
309
0
            return *m_modeDepth[0].bestMode;
310
0
        }
311
0
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
312
0
            compressInterCU_dist(ctu, cuGeom, qp);
313
0
        else if (m_param->rdLevel <= 4)
314
0
            compressInterCU_rd0_4(ctu, cuGeom, qp);
315
0
        else
316
#if ENABLE_SCC_EXT
317
            compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
318
#else
319
0
            compressInterCU_rd5_6(ctu, cuGeom, qp);
320
0
#endif
321
0
    }
322
323
0
    if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
324
0
        qprdRefine(ctu, cuGeom, qp, qp);
325
326
0
    if (m_param->csvLogLevel >= 2)
327
0
        collectPUStatistics(ctu, cuGeom);
328
329
0
    return *m_modeDepth[0].bestMode;
330
0
}
331
332
void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
333
0
{
334
0
    uint8_t depth = 0;
335
0
    uint8_t partSize = 0;
336
0
    for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
337
0
    {
338
0
        depth = ctu.m_cuDepth[absPartIdx];
339
0
        partSize = ctu.m_partSize[absPartIdx];
340
0
        uint32_t numPU = nbPartsTable[(int)partSize];
341
0
        int shift = 2 * (m_param->maxCUDepth + 1 - depth);
342
0
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
343
0
        {
344
0
            PredictionUnit pu(ctu, cuGeom, puIdx);
345
0
            int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
346
0
            int mode = 1;
347
0
            if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
348
0
                mode = 2;
349
0
            else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
350
0
                 mode = 3;
351
0
            if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
352
0
            {
353
0
                ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
354
0
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
355
0
            }
356
0
            else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
357
0
            {
358
0
                if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
359
0
                {
360
0
                    ctu.m_encData->m_frameStats.cnt4x4++;
361
0
                    ctu.m_encData->m_frameStats.totalPu[4]++;
362
0
                }
363
0
                else
364
0
                {
365
0
                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
366
0
                    ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
367
0
                }
368
0
            }
369
0
            else if (mode == 3)
370
0
            {
371
0
                ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
372
0
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
373
0
                break;
374
0
            }
375
0
            else
376
0
            {
377
0
                if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
378
0
                    ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
379
0
                else
380
0
                    ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
381
382
0
                ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
383
0
            }
384
0
        }
385
0
    }
386
0
}
387
388
int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
389
0
{
390
0
    float predDepth = 0;
391
0
    CUData* neighbourCU;
392
0
    uint8_t count = 0;
393
0
    int32_t maxTUDepth = -1;
394
0
    neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
395
0
    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
396
0
    count++;
397
0
    if (m_slice->isInterB())
398
0
    {
399
0
        neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
400
0
        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
401
0
        count++;
402
0
    }
403
0
    if (parentCTU.m_cuAbove)
404
0
    {
405
0
        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
406
0
        count++;
407
0
        if (parentCTU.m_cuAboveLeft)
408
0
        {
409
0
            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
410
0
            count++;
411
0
        }
412
0
        if (parentCTU.m_cuAboveRight)
413
0
        {
414
0
            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
415
0
            count++;
416
0
        }
417
0
    }
418
0
    if (parentCTU.m_cuLeft)
419
0
    {
420
0
        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
421
0
        count++;
422
0
    }
423
0
    predDepth /= count;
424
425
0
    if (predDepth == 0)
426
0
        maxTUDepth = 0;
427
0
    else if (predDepth < 1)
428
0
        maxTUDepth = 1;
429
0
    else if (predDepth >= 1 && predDepth <= 1.5)
430
0
        maxTUDepth = 2;
431
0
    else if (predDepth > 1.5 && predDepth <= 2.5)
432
0
        maxTUDepth = 3;
433
0
    else
434
0
        maxTUDepth = -1;
435
436
0
    return maxTUDepth;
437
0
}
438
439
void Analysis::tryLossless(const CUGeom& cuGeom)
440
0
{
441
0
    ModeDepth& md = m_modeDepth[cuGeom.depth];
442
443
0
    if (!md.bestMode->distortion)
444
        /* already lossless */
445
0
        return;
446
0
    else if (md.bestMode->cu.isIntra(0))
447
0
    {
448
0
        md.pred[PRED_LOSSLESS].initCosts();
449
0
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
450
0
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
451
0
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
452
0
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
453
0
    }
454
0
    else
455
0
    {
456
0
        md.pred[PRED_LOSSLESS].initCosts();
457
0
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
458
0
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
459
0
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
460
0
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
461
0
    }
462
0
}
463
464
void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
465
0
{
466
0
    uint32_t depth = cuGeom.depth;
467
0
    ModeDepth& md = m_modeDepth[depth];
468
0
    md.bestMode = NULL;
469
470
0
    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
471
472
0
    int bestCUQP = qp;
473
0
    int lambdaQP = lqp;
474
0
    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
475
0
    if (m_param->analysisLoadReuseLevel >= 7)
476
0
        doQPRefine = false;
477
0
    if (doQPRefine)
478
0
    {
479
0
        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
480
481
0
        int cuIdx = (cuGeom.childOffset - 1) / 3;
482
0
        bestCUCost = origCUCost = cacheCost[cuIdx];
483
484
0
        int direction = m_param->bOptCUDeltaQP ? 1 : 2;
485
486
0
        for (int dir = direction; dir >= -direction; dir -= (direction * 2))
487
0
        {
488
0
            if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
489
0
                break;
490
491
0
            int threshold = 1;
492
0
            int failure = 0;
493
0
            cuPrevCost = origCUCost;
494
495
0
            int modCUQP = qp + dir;
496
0
            while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
497
0
            {
498
0
                if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
499
0
                    break;
500
501
0
                recodeCU(parentCTU, cuGeom, modCUQP, qp);
502
0
                cuCost = md.bestMode->rdCost;
503
504
0
                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
505
0
                if (cuCost < cuPrevCost)
506
0
                    failure = 0;
507
0
                else
508
0
                    failure++;
509
510
0
                if (failure > threshold)
511
0
                    break;
512
513
0
                cuPrevCost = cuCost;
514
0
                modCUQP += dir;
515
0
            }
516
0
        }
517
0
        lambdaQP = bestCUQP;
518
0
    }
519
520
0
    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
521
522
    /* Copy best data to encData CTU and recon */
523
0
    md.bestMode->cu.copyToPic(depth);
524
0
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
525
0
}
526
527
#if ENABLE_SCC_EXT
528
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
529
#else
530
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
531
#endif
532
0
{
533
0
    uint32_t depth = cuGeom.depth;
534
0
    ModeDepth& md = m_modeDepth[depth];
535
0
    md.bestMode = NULL;
536
537
0
    MV iMVCandList[4][10];
538
0
    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
539
540
0
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
541
0
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
542
543
0
    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
544
0
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
545
0
    int split = 0;
546
0
    if (m_param->intraRefine && m_param->intraRefine != 4)
547
0
    {
548
0
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
549
0
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
550
0
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
551
0
            bAlreadyDecided = false;
552
0
    }
553
554
0
    if (bAlreadyDecided)
555
0
    {
556
0
        if (bDecidedDepth && mightNotSplit)
557
0
        {
558
0
            Mode& mode = md.pred[0];
559
0
            md.bestMode = &mode;
560
0
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
561
0
            bool reuseModes = !((m_param->intraRefine == 3) ||
562
0
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
563
0
            if (reuseModes)
564
0
            {
565
0
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
566
0
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
567
0
            }
568
0
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
569
570
0
            if (m_bTryLossless)
571
0
                tryLossless(cuGeom);
572
573
0
            if (mightSplit)
574
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
575
0
        }
576
0
    }
577
0
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
578
0
    {
579
0
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
580
0
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
581
0
        checkBestMode(md.pred[PRED_INTRA], depth);
582
583
0
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
584
0
        {
585
0
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
586
0
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
587
0
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
588
0
        }
589
590
#if ENABLE_SCC_EXT
591
        bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
592
        if (m_param->bEnableSCC)
593
        {
594
            md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
595
            checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
596
597
            md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
598
            checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
599
            checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
600
601
            if (intraBlockCopyFastSearch)
602
            {
603
                if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
604
                {
605
                    md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
606
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
607
                    checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
608
609
                    md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
610
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
611
                    checkBestMode(md.pred[PRED_IBC_2NxN], depth);
612
                }
613
            }
614
            else
615
            {
616
                md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
617
                checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
618
                checkBestMode(md.pred[PRED_IBC_2NxN], depth);
619
620
                md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
621
                checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
622
                checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
623
            }
624
        }
625
#endif
626
627
0
        if (m_bTryLossless)
628
0
            tryLossless(cuGeom);
629
630
0
        if (mightSplit)
631
0
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
632
0
    }
633
634
#if ENABLE_SCC_EXT
635
    // If Intra BC keep last coded Mv
636
    if (md.bestMode && md.bestMode->cu.isInter(0))
637
    {
638
        MVField mvField;
639
        const CUData* cu = &md.bestMode->cu;
640
        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
641
        int iRefIdxFirst = mvField.refIdx;
642
        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
643
        int iRefIdxLast = mvField.refIdx;
644
        bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
645
        bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
646
647
        if (isIntraBCFirst || isIntraBCLast)
648
        {
649
            if (cu->m_partSize[0] == SIZE_2Nx2N)
650
            {
651
                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
652
                if (mvField.mv != cu->m_lastIntraBCMv[0])
653
                {
654
                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
655
                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
656
                }
657
            }
658
            else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
659
            {
660
                // mixed PU, only one partition is IntraBC coded
661
                if (isIntraBCFirst != isIntraBCLast)
662
                {
663
                    if (isIntraBCFirst)
664
                    {
665
                        // Part 0
666
                        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
667
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
668
                        {
669
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
670
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
671
                        }
672
                    }
673
                    else if (isIntraBCLast)
674
                    {
675
                        // Part 1
676
                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
677
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
678
                        {
679
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
680
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
681
                        }
682
                    }
683
                }
684
                else // normal IntraBC CU
685
                {
686
                    // Part 0
687
                    md.bestMode->cu.getMvField(cu, 0, 0, mvField);
688
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
689
                    {
690
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
691
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
692
                    }
693
                    // Part 1
694
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
695
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
696
                    {
697
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
698
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
699
                    }
700
                }
701
            }
702
            else
703
            {
704
                // NxN
705
                for (int part = 0; part < 4; part++)
706
                {
707
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
708
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
709
                    {
710
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
711
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
712
                    }
713
                }
714
            }
715
        }
716
    } // is inter
717
#endif
718
719
    // stop recursion if we reach the depth of previous analysis decision
720
0
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;
721
722
0
    if (mightSplit)
723
0
    {
724
0
        Mode* splitPred = &md.pred[PRED_SPLIT];
725
0
        splitPred->initCosts();
726
0
        CUData* splitCU = &splitPred->cu;
727
0
        splitCU->initSubCU(parentCTU, cuGeom, qp);
728
729
0
        uint32_t nextDepth = depth + 1;
730
0
        ModeDepth& nd = m_modeDepth[nextDepth];
731
0
        invalidateContexts(nextDepth);
732
0
        Entropy* nextContext = &m_rqt[depth].cur;
733
0
        int32_t nextQP = qp;
734
0
        uint64_t curCost = 0;
735
0
        int skipSplitCheck = 0;
736
737
0
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
738
0
        {
739
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
740
0
            if (childGeom.flags & CUGeom::PRESENT)
741
0
            {
742
0
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
743
0
                m_rqt[nextDepth].cur.load(*nextContext);
744
745
0
                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
746
0
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
747
748
0
                if (m_param->bEnableSplitRdSkip)
749
0
                {
750
#if ENABLE_SCC_EXT
751
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP, ibc);
752
#else
753
0
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
754
0
#endif
755
0
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
756
0
                    {
757
0
                        skipSplitCheck = 1;
758
0
                        break;
759
0
                    }
760
0
                }
761
0
                else
762
763
0
#if !ENABLE_SCC_EXT
764
0
                    compressIntraCU(parentCTU, childGeom, nextQP);
765
#else
766
                    compressIntraCU(parentCTU, childGeom, nextQP, ibc);
767
768
                if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0)
769
                {
770
                    for (int i = 0; i < 2; i++)
771
                    {
772
                        ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i];
773
                    }
774
                }
775
#endif
776
777
                // Save best CU and pred data for this sub CU
778
0
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
779
0
                splitPred->addSubCosts(*nd.bestMode);
780
0
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
781
0
                nextContext = &nd.bestMode->contexts;
782
0
            }
783
0
            else
784
0
            {
785
                /* record the depth of this non-present sub-CU */
786
0
                splitCU->setEmptyPart(childGeom, subPartIdx);
787
788
                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
789
0
                if (bAlreadyDecided)
790
0
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
791
0
            }
792
0
        }
793
0
        if (!skipSplitCheck)
794
0
        {
795
0
            nextContext->store(splitPred->contexts);
796
0
            if (mightNotSplit)
797
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
798
0
            else
799
0
                updateModeCost(*splitPred);
800
801
0
            checkDQPForSplitPred(*splitPred, cuGeom);
802
0
            checkBestMode(*splitPred, depth);
803
0
        }
804
0
    }
805
806
0
    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
807
0
    {
808
0
        int cuIdx = (cuGeom.childOffset - 1) / 3;
809
0
        cacheCost[cuIdx] = md.bestMode->rdCost;
810
0
    }
811
812
0
    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
813
0
    {
814
0
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
815
0
        int8_t maxTUDepth = -1;
816
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
817
0
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
818
0
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
819
0
    }
820
821
    /* Copy best data to encData CTU and recon */
822
0
    md.bestMode->cu.copyToPic(depth);
823
0
    if (md.bestMode != &md.pred[PRED_SPLIT])
824
0
    {
825
0
        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
826
0
            md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx);
827
0
    }
828
829
0
    return md.bestMode->rdCost;
830
0
}
831
832
/* Bonded-task entry point: invoked once per worker thread that joins this
 * PMODE job batch. Delegates to the master Analysis, passing the worker's
 * thread-local Analysis instance so mode searches run without sharing
 * scratch state between threads. */
void Analysis::PMODE::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
    /* Attribute this task (count + elapsed time) to the frame encoder that
     * owns the CU being analyzed; pmodeTime accumulates via RAII on scope exit */
    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
    master.m_stats[fe].countPModeTasks++;
    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
#endif
    ProfileScopeEvent(pmode);
    /* the slave Analysis is the per-thread analysis context of this worker */
    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
}
842
843
/* process pmode jobs until none remain; may be called by the master thread or by
 * a bonded peer (slave) thread via pmodeTasks() */
void Analysis::processPmode(PMODE& pmode, Analysis& slave)
{
    /* acquire a mode task, else exit early */
    int task;
    pmode.m_lock.acquire();
    if (pmode.m_jobTotal > pmode.m_jobAcquired)
    {
        task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    else
    {
        /* all jobs already claimed by other threads; nothing to do */
        pmode.m_lock.release();
        return;
    }

    ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];

    /* setup slave Analysis: when called from a bonded peer, copy the master's
     * frame/slice/param context and entropy state into the worker's own
     * Analysis so its searches match the master's configuration */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
        slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
        slave.invalidateContexts(0);
        slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
    }

    /* perform Mode task, repeat until no more work is available */
    do
    {
        /* per-task motion reference masks, built from the sub-CU results in
         * m_splitRefIdx[]: [0]/[1] cover the two partitions of rect/AMP shapes */
        uint32_t refMasks[2] = { 0, 0 };

        if (m_param->rdLevel <= 4)
        {
            /* sa8d-based analysis path */
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                if (m_param->rdLevel > 2)
                    slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                break;

            case PRED_2Nx2N:
                /* full CU: union of all four sub-CU reference masks */
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                if (m_slice->m_sliceType == B_SLICE)
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */

                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }
        else
        {
            /* full RDO analysis path (rd-level 5/6) */
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
                /* also try NxN intra at the smallest CU size when TUs can split below 8x8 */
                if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                /* mark BIDIR invalid unless the B-slice search below fills it in */
                md.pred[PRED_BIDIR].rdCost = MAX_INT64;
                if (m_slice->m_sliceType == B_SLICE)
                {
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                        slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
                }
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }

        /* try to claim the next job; loop exits when none remain */
        task = -1;
        pmode.m_lock.acquire();
        if (pmode.m_jobTotal > pmode.m_jobAcquired)
            task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    while (task >= 0);
}
1022
1023
/* Recursively analyze an inter CU using distributed (parallel) mode analysis:
 * prediction-mode searches are published as PMODE jobs that bonded worker
 * threads may steal, while this (master) thread recurses into sub-CUs and
 * finally performs merge analysis and best-mode selection itself.
 * Returns the mask of motion references used by the best mode, so the parent
 * CU can limit its own reference search (X265_REF_LIMIT_DEPTH). */
uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    /* lookahead-derived minimum depth is only used by the sa8d (rd <= 4) path */
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    PMODE pmode(*this, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);

        /* merge/skip is checked serially by the master thread (see comment below) */
        if (m_param->rdLevel <= 4)
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        else
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
    }

    bool bNoSplit = false;
    bool splitIntra = true;
    if (md.bestMode)
    {
        /* a skipped best mode can short-circuit the split recursion entirely */
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        /* recurse into the four sub-CUs and accumulate the split-mode cost */
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                /* each sub-CU starts from the entropy state left by its predecessor */
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
    }

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
        /* intra-in-inter is skipped at max CU size, and under limitReferences it is
         * only tried when at least one sub-CU chose intra */
        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);

        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);

        /* initialize each candidate prediction CU and enqueue its analysis job;
         * workers index md.pred[] by the PRED_* job id stored in pmode.modes[] */
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
        }
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
        }

        /* publish the sub-CU reference masks workers read inside processPmode() */
        m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];

        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);

        /* participate in processing jobs, until all are distributed */
        processPmode(pmode, *this);

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            {
                /* wait for all bonded peers to finish before reading their results */
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                /* pure sa8d selection; only the winner gets residual coding */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            /* rd-level 5/6: workers produced full RD costs, just pick the best */
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare split RD cost against best cost */
    if (mightSplit && !bNoSplit)
        checkBestMode(md.pred[PRED_SPLIT], depth);

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    if (mightNotSplit)
    {
        /* early-out statistics: running average RD cost per depth, read by
         * recursionDepthCheck() on later CUs of this CTU */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], cuAddr, cuGeom.absPartIdx);

    return refMask;
}
1307
1308
SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1309
0
{
1310
0
    if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
1311
0
        return compressInterCU_rd5_6(parentCTU, cuGeom, qp);
1312
1313
0
    uint32_t depth = cuGeom.depth;
1314
0
    uint32_t cuAddr = parentCTU.m_cuAddr;
1315
0
    ModeDepth& md = m_modeDepth[depth];
1316
1317
1318
0
    if (m_param->searchMethod == X265_SEA)
1319
0
    {
1320
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
1321
0
        int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]);
1322
0
        for (int list = 0; list < numPredDir; list++)
1323
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1324
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1325
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1326
0
    }
1327
1328
0
    PicYuv& reconPic = *m_frame->m_reconPic[0];
1329
0
    SplitData splitCUData;
1330
1331
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1332
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1333
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1334
1335
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1336
0
    {
1337
0
        md.bestMode = NULL;
1338
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1339
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1340
0
        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
1341
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1342
0
        bool skipModes = false; /* Skip any remaining mode analyses at current depth */
1343
0
        bool skipRecursion = false; /* Skip recursion */
1344
0
        bool splitIntra = true;
1345
0
        bool skipRectAmp = false;
1346
0
        bool chooseMerge = false;
1347
0
        bool bCtuInfoCheck = false;
1348
0
        int sameContentRef = 0;
1349
1350
0
        if (m_evaluateInter)
1351
0
        {
1352
0
            if (m_refineLevel == 2)
1353
0
            {
1354
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1355
0
                    skipModes = true;
1356
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1357
0
                    skipRectAmp = true;
1358
0
            }
1359
0
            mightSplit &= false;
1360
0
            minDepth = depth;
1361
0
        }
1362
1363
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1364
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1365
1366
0
        SplitData splitData[4];
1367
0
        splitData[0].initSplitCUData();
1368
0
        splitData[1].initSplitCUData();
1369
0
        splitData[2].initSplitCUData();
1370
0
        splitData[3].initSplitCUData();
1371
1372
        // avoid uninitialized values in the references below
1373
0
        if (m_param->limitModes)
1374
0
        {
1375
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1376
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1377
0
            md.pred[PRED_2Nx2N].sa8dCost = 0;
1378
0
        }
1379
1380
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1381
0
        {
1382
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1383
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1384
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1385
0
            {
1386
0
                mightNotSplit &= bDecidedDepth;
1387
0
                bCtuInfoCheck = skipRecursion = false;
1388
0
                skipModes = true;
1389
0
            }
1390
0
            else if (mightNotSplit && bDecidedDepth)
1391
0
            {
1392
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
1393
0
                {
1394
0
                    bCtuInfoCheck = skipRecursion = true;
1395
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1396
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1397
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1398
0
                    if (!sameContentRef)
1399
0
                    {
1400
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1401
0
                        {
1402
0
                            qp -= int32_t(0.04 * qp);
1403
0
                            setLambdaFromQP(parentCTU, qp);
1404
0
                        }
1405
0
                        if (m_param->bCTUInfo & 4)
1406
0
                            skipModes = false;
1407
0
                    }
1408
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1409
0
                    {
1410
0
                        if (m_param->rdLevel)
1411
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1412
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
1413
0
                            skipModes = md.bestMode && true;
1414
0
                    }
1415
0
                }
1416
0
                else
1417
0
                {
1418
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1419
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1420
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1421
0
                    if (m_param->rdLevel)
1422
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1423
0
                }
1424
0
                mightSplit &= !bDecidedDepth;
1425
0
            }
1426
0
        }
1427
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
1428
0
        {
1429
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1430
0
            {
1431
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1432
0
                {
1433
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1434
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1435
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1436
1437
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1438
0
                    if (m_param->rdLevel)
1439
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1440
0
                }
1441
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1442
0
                {
1443
0
                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
1444
0
                    {
1445
0
                        skipRectAmp = true && !!md.bestMode;
1446
0
                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
1447
0
                    }
1448
0
                }
1449
0
            }
1450
0
        }
1451
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1452
0
        {
1453
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1454
0
            {
1455
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1456
0
                {
1457
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1458
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1459
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1460
1461
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1462
0
                    if (m_param->rdLevel)
1463
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1464
0
                }
1465
0
            }
1466
0
        }
1467
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
1468
0
        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1469
            /* TODO: Re-evaluate if analysis load/save still works */
1470
0
        {
1471
            /* Compute Merge Cost */
1472
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1473
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1474
0
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1475
0
            if (m_param->rdLevel)
1476
0
                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
1477
0
                && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
1478
0
        }
1479
0
        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1480
0
        {
1481
0
            skipRecursion = md.bestMode->cu.isSkipped(0);
1482
0
            if (mightSplit && !skipRecursion)
1483
0
            {
1484
0
                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
1485
0
                {
1486
0
                    if (depth)
1487
0
                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1488
0
                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
1489
0
                        skipRecursion = complexityCheckCU(*md.bestMode);
1490
0
                }
1491
0
                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
1492
0
                {
1493
0
                    skipRecursion = complexityCheckCU(*md.bestMode);
1494
0
                }
1495
1496
0
            }
1497
0
        }
1498
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
1499
0
            skipRecursion = true;
1500
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1501
0
        if (mightSplit && !skipRecursion)
1502
0
        {
1503
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
1504
0
                qp = int((1 / 0.96) * qp + 0.5);
1505
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
1506
0
            splitPred->initCosts();
1507
0
            CUData* splitCU = &splitPred->cu;
1508
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
1509
1510
0
            uint32_t nextDepth = depth + 1;
1511
0
            ModeDepth& nd = m_modeDepth[nextDepth];
1512
0
            invalidateContexts(nextDepth);
1513
0
            Entropy* nextContext = &m_rqt[depth].cur;
1514
0
            int nextQP = qp;
1515
0
            splitIntra = false;
1516
1517
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1518
0
            {
1519
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1520
0
                if (childGeom.flags & CUGeom::PRESENT)
1521
0
                {
1522
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1523
0
                    m_rqt[nextDepth].cur.load(*nextContext);
1524
1525
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1526
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1527
1528
0
                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
1529
1530
                    // Save best CU and pred data for this sub CU
1531
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
1532
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1533
0
                    splitPred->addSubCosts(*nd.bestMode);
1534
1535
0
                    if (m_param->rdLevel)
1536
0
                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1537
0
                    else
1538
0
                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
1539
0
                    if (m_param->rdLevel > 1)
1540
0
                        nextContext = &nd.bestMode->contexts;
1541
0
                }
1542
0
                else
1543
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
1544
0
            }
1545
0
            nextContext->store(splitPred->contexts);
1546
1547
0
            if (mightNotSplit)
1548
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
1549
0
            else if (m_param->rdLevel > 1)
1550
0
                updateModeCost(*splitPred);
1551
0
            else
1552
0
                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
1553
0
        }
1554
        /* If analysis mode is simple, do not evaluate other modes */
1555
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
1556
0
        {
1557
0
            if (m_slice->m_sliceType == P_SLICE)
1558
0
            {
1559
0
                if (m_checkMergeAndSkipOnly[0])
1560
0
                    skipModes = true;
1561
0
            }
1562
0
            else
1563
0
            {
1564
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
1565
0
                    skipModes = true;
1566
0
            }
1567
0
        }
1568
        /* Split CUs
1569
         *   0  1
1570
         *   2  3 */
1571
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1572
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1573
0
        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
1574
0
        {
1575
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1576
0
                setLambdaFromQP(parentCTU, qp);
1577
1578
0
            if (!skipModes)
1579
0
            {
1580
0
                uint32_t refMasks[2];
1581
0
                refMasks[0] = allSplitRefs;
1582
0
                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1583
0
                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1584
1585
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
1586
0
                {
1587
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
1588
0
                    uint32_t refMask = cu.getBestRefIdx(0);
1589
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1590
0
                }
1591
1592
0
                if (m_slice->m_sliceType == B_SLICE)
1593
0
                {
1594
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1595
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1596
0
                }
1597
1598
0
                Mode *bestInter = &md.pred[PRED_2Nx2N];
1599
0
                if (!skipRectAmp)
1600
0
                {
1601
0
                    if (m_param->bEnableRectInter)
1602
0
                    {
1603
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1604
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
1605
1606
0
                        if (m_slice->m_sliceType == P_SLICE)
1607
0
                        {
1608
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1609
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1610
0
                        }
1611
0
                        else
1612
0
                        {
1613
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1614
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1615
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1616
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1617
0
                        }
1618
1619
0
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1620
0
                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1621
0
                        {
1622
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1623
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1624
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1625
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1626
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1627
0
                                bestInter = &md.pred[PRED_2NxN];
1628
0
                        }
1629
1630
0
                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
1631
0
                        {
1632
0
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1633
0
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1634
0
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1635
0
                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1636
0
                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1637
0
                                bestInter = &md.pred[PRED_Nx2N];
1638
0
                        }
1639
1640
0
                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1641
0
                        {
1642
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1643
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1644
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1645
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1646
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1647
0
                                bestInter = &md.pred[PRED_2NxN];
1648
0
                        }
1649
0
                    }
1650
1651
0
                    if (m_slice->m_sps->maxAMPDepth > depth)
1652
0
                    {
1653
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1654
0
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1655
1656
0
                        if (m_slice->m_sliceType == P_SLICE)
1657
0
                        {
1658
0
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1659
0
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1660
1661
0
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1662
0
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1663
0
                        }
1664
0
                        else
1665
0
                        {
1666
0
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1667
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1668
0
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
1669
0
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1670
1671
0
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1672
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1673
0
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
1674
0
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1675
0
                        }
1676
1677
0
                        bool bHor = false, bVer = false;
1678
0
                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
1679
0
                            bHor = true;
1680
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
1681
0
                            bVer = true;
1682
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
1683
0
                            md.bestMode && md.bestMode->cu.getQtRootCbf(0))
1684
0
                        {
1685
0
                            bHor = true;
1686
0
                            bVer = true;
1687
0
                        }
1688
1689
0
                        if (bHor)
1690
0
                        {
1691
0
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1692
0
                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1693
0
                            {
1694
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1695
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1696
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1697
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1698
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1699
0
                                    bestInter = &md.pred[PRED_2NxnD];
1700
0
                            }
1701
1702
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
1703
0
                            {
1704
0
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1705
0
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
1706
0
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1707
0
                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1708
0
                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1709
0
                                    bestInter = &md.pred[PRED_2NxnU];
1710
0
                            }
1711
1712
0
                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1713
0
                            {
1714
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1715
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1716
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1717
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1718
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1719
0
                                    bestInter = &md.pred[PRED_2NxnD];
1720
0
                            }
1721
0
                        }
1722
0
                        if (bVer)
1723
0
                        {
1724
0
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1725
0
                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1726
0
                            {
1727
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1728
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1729
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1730
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1731
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1732
0
                                    bestInter = &md.pred[PRED_nRx2N];
1733
0
                            }
1734
1735
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
1736
0
                            {
1737
0
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
1738
0
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
1739
0
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1740
0
                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1741
0
                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1742
0
                                    bestInter = &md.pred[PRED_nLx2N];
1743
0
                            }
1744
1745
0
                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1746
0
                            {
1747
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1748
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1749
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1750
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1751
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1752
0
                                    bestInter = &md.pred[PRED_nRx2N];
1753
0
                            }
1754
0
                        }
1755
0
                    }
1756
0
                }
1757
0
                bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
1758
0
                if (m_param->rdLevel >= 3)
1759
0
                {
1760
                    /* Calculate RD cost of best inter option */
1761
0
                    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1762
0
                    {
1763
0
                        uint32_t numPU = bestInter->cu.getNumPartInter(0);
1764
0
                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1765
0
                        {
1766
0
                            PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1767
0
                            motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1768
0
                        }
1769
0
                    }
1770
1771
0
                    if (!chooseMerge)
1772
0
                    {
1773
0
                        encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1774
0
                        checkBestMode(*bestInter, depth);
1775
1776
                        /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1777
0
                        if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1778
0
                            md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1779
0
                        {
1780
0
                            uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
1781
0
                            if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
1782
0
                                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1783
0
                                {
1784
0
                                    PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
1785
0
                                    motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
1786
0
                                }
1787
0
                            encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1788
0
                            checkBestMode(md.pred[PRED_BIDIR], depth);
1789
0
                        }
1790
0
                    }
1791
1792
0
                    if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1793
0
                        md.bestMode->sa8dCost == MAX_INT64)
1794
0
                    {
1795
0
                        if (!m_param->limitReferences || splitIntra)
1796
0
                        {
1797
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1798
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1799
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1800
0
                            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1801
0
                            checkBestMode(md.pred[PRED_INTRA], depth);
1802
0
                        }
1803
0
                        else
1804
0
                        {
1805
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1806
0
                        }
1807
0
                    }
1808
0
                }
1809
0
                else
1810
0
                {
1811
                    /* SA8D choice between merge/skip, inter, bidir, and intra */
1812
0
                    if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1813
0
                        md.bestMode = bestInter;
1814
1815
0
                    if (m_slice->m_sliceType == B_SLICE &&
1816
0
                        md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1817
0
                        md.bestMode = &md.pred[PRED_BIDIR];
1818
1819
0
                    if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1820
0
                    {
1821
0
                        if (!m_param->limitReferences || splitIntra)
1822
0
                        {
1823
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1824
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1825
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1826
0
                            if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1827
0
                                md.bestMode = &md.pred[PRED_INTRA];
1828
0
                        }
1829
0
                        else
1830
0
                        {
1831
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1832
0
                        }
1833
0
                    }
1834
1835
                    /* finally code the best mode selected by SA8D costs:
1836
                     * RD level 2 - fully encode the best mode
1837
                     * RD level 1 - generate recon pixels
1838
                     * RD level 0 - generate chroma prediction */
1839
0
                    if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
1840
0
                    {
1841
                        /* prediction already generated for this CU, and if rd level
1842
                         * is not 0, it is already fully encoded */
1843
0
                    }
1844
0
                    else if (md.bestMode->cu.isInter(0))
1845
0
                    {
1846
0
                        uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1847
0
                        if (m_csp != X265_CSP_I400)
1848
0
                        {
1849
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1850
0
                            {
1851
0
                                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1852
0
                                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1853
0
                            }
1854
0
                        }
1855
0
                        if (m_param->rdLevel == 2)
1856
0
                            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1857
0
                        else if (m_param->rdLevel == 1)
1858
0
                        {
1859
                            /* generate recon pixels with no rate distortion considerations */
1860
0
                            CUData& cu = md.bestMode->cu;
1861
1862
0
                            uint32_t tuDepthRange[2];
1863
0
                            cu.getInterTUQtDepthRange(tuDepthRange, 0);
1864
0
                            m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
1865
0
                            residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1866
0
                            if (cu.getQtRootCbf(0))
1867
0
                                md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
1868
0
                            else
1869
0
                            {
1870
0
                                md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
1871
0
                                if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
1872
0
                                    cu.setPredModeSubParts(MODE_SKIP);
1873
0
                            }
1874
0
                        }
1875
0
                    }
1876
0
                    else
1877
0
                    {
1878
0
                        if (m_param->rdLevel == 2)
1879
0
                            encodeIntraInInter(*md.bestMode, cuGeom);
1880
0
                        else if (m_param->rdLevel == 1)
1881
0
                        {
1882
                            /* generate recon pixels with no rate distortion considerations */
1883
0
                            CUData& cu = md.bestMode->cu;
1884
1885
0
                            uint32_t tuDepthRange[2];
1886
0
                            cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1887
1888
0
                            residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1889
0
                            if (m_csp != X265_CSP_I400)
1890
0
                            {
1891
0
                                getBestIntraModeChroma(*md.bestMode, cuGeom);
1892
0
                                residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
1893
0
                            }
1894
0
                            md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
1895
0
                        }
1896
0
                    }
1897
0
                }
1898
0
            } // !earlyskip
1899
1900
0
            if (m_bTryLossless)
1901
0
                tryLossless(cuGeom);
1902
1903
0
            if (mightSplit)
1904
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
1905
0
        }
1906
1907
0
        if (mightSplit && !skipRecursion)
1908
0
        {
1909
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
1910
0
            if (!md.bestMode)
1911
0
                md.bestMode = splitPred;
1912
0
            else if (m_param->rdLevel > 1)
1913
0
                checkBestMode(*splitPred, cuGeom.depth);
1914
0
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
1915
0
                md.bestMode = splitPred;
1916
1917
0
            checkDQPForSplitPred(*md.bestMode, cuGeom);
1918
0
        }
1919
1920
        /* determine which motion references the parent CU should search */
1921
0
        splitCUData.initSplitCUData();
1922
1923
0
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1924
0
        {
1925
0
            if (md.bestMode == &md.pred[PRED_SPLIT])
1926
0
                splitCUData.splitRefs = allSplitRefs;
1927
0
            else
1928
0
            {
1929
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1930
0
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1931
0
                uint32_t numPU = cu.getNumPartInter(0);
1932
0
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1933
0
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1934
0
            }
1935
0
        }
1936
1937
0
        if (m_param->limitModes)
1938
0
        {
1939
0
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1940
0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1941
0
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1942
0
        }
1943
1944
0
        if (mightNotSplit && md.bestMode->cu.isSkipped(0))
1945
0
        {
1946
0
            FrameData& curEncData = *m_frame->m_encData;
1947
0
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1948
0
            uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1949
0
            cuStat.count[depth] += 1;
1950
0
            cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1951
0
        }
1952
1953
        /* Copy best data to encData CTU and recon */
1954
0
        md.bestMode->cu.copyToPic(depth);
1955
0
        if (m_param->rdLevel)
1956
0
            md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
1957
1958
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1959
0
        {
1960
0
            if (mightNotSplit)
1961
0
            {
1962
0
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
1963
0
                int8_t maxTUDepth = -1;
1964
0
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
1965
0
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
1966
0
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
1967
0
            }
1968
0
        }
1969
0
    }
1970
0
    else
1971
0
    {
1972
0
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
1973
0
        {
1974
0
            qprdRefine(parentCTU, cuGeom, qp, qp);
1975
1976
0
            SplitData splitData[4];
1977
0
            splitData[0].initSplitCUData();
1978
0
            splitData[1].initSplitCUData();
1979
0
            splitData[2].initSplitCUData();
1980
0
            splitData[3].initSplitCUData();
1981
1982
0
            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1983
1984
0
            splitCUData.initSplitCUData();
1985
1986
0
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1987
0
            {
1988
0
                if (md.bestMode == &md.pred[PRED_SPLIT])
1989
0
                    splitCUData.splitRefs = allSplitRefs;
1990
0
                else
1991
0
                {
1992
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1993
0
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1994
0
                    uint32_t numPU = cu.getNumPartInter(0);
1995
0
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1996
0
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1997
0
                }
1998
0
            }
1999
2000
0
            if (m_param->limitModes)
2001
0
            {
2002
0
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2003
0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2004
0
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
2005
0
            }
2006
0
        }
2007
0
    }
2008
2009
0
    return splitCUData;
2010
0
}
2011
2012
#if ENABLE_SCC_EXT
2013
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
2014
#else
2015
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
2016
#endif
2017
0
{
2018
0
    if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
2019
0
        return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2020
2021
0
    uint32_t depth = cuGeom.depth;
2022
0
    ModeDepth& md = m_modeDepth[depth];
2023
0
    md.bestMode = NULL;
2024
2025
0
    Mode* interBest = NULL; // store the best modes in inter prediction
2026
2027
0
    MV iMVCandList[4][10];
2028
0
    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
2029
2030
0
    if (m_param->searchMethod == X265_SEA)
2031
0
    {
2032
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
2033
0
        int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]);
2034
0
        for (int list = 0; list < numPredDir; list++)
2035
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
2036
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2037
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
2038
0
    }
2039
2040
0
    SplitData splitCUData;
2041
2042
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
2043
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
2044
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
2045
2046
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
2047
0
    {
2048
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2049
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2050
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2051
0
        bool skipRecursion = false;
2052
0
        bool skipModes = false;
2053
0
        bool splitIntra = true;
2054
0
        bool skipRectAmp = false;
2055
0
        bool bCtuInfoCheck = false;
2056
0
        int sameContentRef = 0;
2057
2058
0
        if (m_evaluateInter)
2059
0
        {
2060
0
            if (m_refineLevel == 2)
2061
0
            {
2062
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
2063
0
                    skipModes = true;
2064
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
2065
0
                    skipRectAmp = true;
2066
0
            }
2067
0
            mightSplit &= false;
2068
0
        }
2069
2070
        // avoid uninitialize value in below reference
2071
0
        if (m_param->limitModes)
2072
0
        {
2073
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
2074
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
2075
0
            md.pred[PRED_2Nx2N].rdCost = 0;
2076
0
        }
2077
2078
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2079
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
2080
2081
0
        SplitData splitData[4];
2082
0
        splitData[0].initSplitCUData();
2083
0
        splitData[1].initSplitCUData();
2084
0
        splitData[2].initSplitCUData();
2085
0
        splitData[3].initSplitCUData();
2086
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2087
0
        uint32_t refMasks[2];
2088
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
2089
0
        {
2090
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
2091
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
2092
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
2093
0
            {
2094
0
                mightNotSplit &= bDecidedDepth;
2095
0
                bCtuInfoCheck = skipRecursion = false;
2096
0
                skipModes = true;
2097
0
            }
2098
0
            else if (mightNotSplit && bDecidedDepth)
2099
0
            {
2100
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
2101
0
                {
2102
0
                    bCtuInfoCheck = skipRecursion = true;
2103
0
                    refMasks[0] = allSplitRefs;
2104
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2105
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2106
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2107
0
                    if (!sameContentRef)
2108
0
                    {
2109
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
2110
0
                        {
2111
0
                            qp -= int32_t(0.04 * qp);
2112
0
                            setLambdaFromQP(parentCTU, qp);
2113
0
                        }
2114
0
                        if (m_param->bCTUInfo & 4)
2115
0
                            skipModes = false;
2116
0
                    }
2117
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
2118
0
                    {
2119
0
                        if (m_param->rdLevel)
2120
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
2121
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
2122
0
                            skipModes = md.bestMode && true;
2123
0
                    }
2124
0
                }
2125
0
                else
2126
0
                {
2127
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2128
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2129
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2130
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2131
0
                    refMasks[0] = allSplitRefs;
2132
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2133
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2134
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2135
#if ENABLE_SCC_EXT
2136
                    interBest = md.bestMode;
2137
#endif
2138
0
                }
2139
0
                mightSplit &= !bDecidedDepth;
2140
0
            }
2141
0
        }
2142
0
        if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2143
0
        {
2144
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
2145
0
            {
2146
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
2147
0
                {
2148
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2149
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2150
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2151
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2152
0
                    refMasks[0] = allSplitRefs;
2153
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2154
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2155
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2156
2157
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2158
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2159
#if ENABLE_SCC_EXT
2160
                    interBest = md.bestMode;
2161
#endif
2162
0
                }
2163
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
2164
0
                    skipRectAmp = true && !!md.bestMode;
2165
0
            }
2166
0
        }
2167
2168
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
2169
0
        {
2170
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
2171
0
            {
2172
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
2173
0
                {
2174
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2175
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2176
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2177
2178
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2179
0
                    refMasks[0] = allSplitRefs;
2180
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2181
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2182
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2183
2184
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2185
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2186
#if ENABLE_SCC_EXT
2187
                    interBest = md.bestMode;
2188
#endif
2189
0
                }
2190
0
            }
2191
0
        }
2192
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
2193
0
        if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
2194
0
            (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
2195
0
        {
2196
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2197
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2198
0
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2199
0
            skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
2200
0
                md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2201
0
            refMasks[0] = allSplitRefs;
2202
0
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2203
0
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2204
0
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2205
2206
#if ENABLE_SCC_EXT
2207
            interBest = md.bestMode;
2208
            if (m_param->bEnableSCC)
2209
            {
2210
                md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2211
                checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
2212
            }
2213
#endif
2214
2215
0
            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
2216
0
                skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2217
0
            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
2218
0
                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
2219
0
        }
2220
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
2221
0
            skipRecursion = true;
2222
        // estimate split cost
2223
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
2224
0
        if (mightSplit && !skipRecursion)
2225
0
        {
2226
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
2227
0
                qp = int((1 / 0.96) * qp + 0.5);
2228
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
2229
0
            splitPred->initCosts();
2230
0
            CUData* splitCU = &splitPred->cu;
2231
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
2232
2233
0
            uint32_t nextDepth = depth + 1;
2234
0
            ModeDepth& nd = m_modeDepth[nextDepth];
2235
0
            invalidateContexts(nextDepth);
2236
0
            Entropy* nextContext = &m_rqt[depth].cur;
2237
0
            int nextQP = qp;
2238
0
            splitIntra = false;
2239
2240
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2241
0
            {
2242
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2243
0
                if (childGeom.flags & CUGeom::PRESENT)
2244
0
                {
2245
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2246
0
                    m_rqt[nextDepth].cur.load(*nextContext);
2247
2248
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2249
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2250
2251
2252
#if ENABLE_SCC_EXT
2253
                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP, ibc);
2254
2255
                    if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0)
2256
                    {
2257
                        for (int i = 0; i < 2; i++)
2258
                            ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i];
2259
                    }
2260
#else
2261
0
                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
2262
0
#endif
2263
2264
                    // Save best CU and pred data for this sub CU
2265
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
2266
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2267
0
                    splitPred->addSubCosts(*nd.bestMode);
2268
0
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2269
0
                    nextContext = &nd.bestMode->contexts;
2270
0
                }
2271
0
                else
2272
0
                {
2273
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
2274
0
                }
2275
0
            }
2276
0
            nextContext->store(splitPred->contexts);
2277
0
            if (mightNotSplit)
2278
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
2279
0
            else
2280
0
                updateModeCost(*splitPred);
2281
2282
0
            checkDQPForSplitPred(*splitPred, cuGeom);
2283
0
        }
2284
        /* If analysis mode is simple do not Evaluate other modes */
2285
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2286
0
        {
2287
0
            if (m_slice->m_sliceType == P_SLICE)
2288
0
            {
2289
0
                if (m_checkMergeAndSkipOnly[0])
2290
0
                    skipModes = true;
2291
0
            }
2292
0
            else
2293
0
            {
2294
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
2295
0
                    skipModes = true;
2296
0
            }
2297
0
        }
2298
        /* Split CUs
2299
         *   0  1
2300
         *   2  3 */
2301
0
        allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2302
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
2303
0
        if (mightNotSplit)
2304
0
        {
2305
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
2306
0
                setLambdaFromQP(parentCTU, qp);
2307
2308
0
            if (!skipModes)
2309
0
            {
2310
0
                refMasks[0] = allSplitRefs;
2311
2312
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
2313
0
                {
2314
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
2315
0
                    uint32_t refMask = cu.getBestRefIdx(0);
2316
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
2317
0
                }
2318
2319
0
                if (m_slice->m_sliceType == B_SLICE)
2320
0
                {
2321
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
2322
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
2323
0
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
2324
0
                    {
2325
0
                        uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
2326
0
                        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
2327
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2328
0
                            {
2329
0
                                PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
2330
0
                                motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
2331
0
                            }
2332
0
                        encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
2333
0
                        checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
2334
0
                    }
2335
0
                }
2336
2337
0
                if (!skipRectAmp)
2338
0
                {
2339
0
                    if (m_param->bEnableRectInter)
2340
0
                    {
2341
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2342
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
2343
2344
0
                        if (m_slice->m_sliceType == P_SLICE)
2345
0
                        {
2346
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2347
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2348
0
                        }
2349
0
                        else
2350
0
                        {
2351
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2352
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2353
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2354
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2355
0
                        }
2356
2357
0
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
2358
0
                        if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2359
0
                        {
2360
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2361
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2362
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2363
#if ENABLE_SCC_EXT
2364
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]);
2365
                            interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest;
2366
#else
2367
0
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2368
0
#endif
2369
0
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2370
0
                        }
2371
2372
0
                        if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
2373
0
                        {
2374
0
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
2375
0
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
2376
0
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2377
#if ENABLE_SCC_EXT
2378
                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks, iMVCandList[SIZE_Nx2N]);
2379
                            interBest = (md.pred[PRED_Nx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_Nx2N] : interBest;
2380
#else
2381
0
                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
2382
0
#endif
2383
0
                            checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
2384
0
                        }
2385
2386
0
                        if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2387
0
                        {
2388
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2389
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2390
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2391
#if ENABLE_SCC_EXT
2392
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]);
2393
                            interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest;
2394
#else
2395
0
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2396
0
#endif
2397
0
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2398
0
                        }
2399
0
                    }
2400
2401
                    // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
2402
0
                    if (m_slice->m_sps->maxAMPDepth > depth)
2403
0
                    {
2404
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2405
0
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
2406
2407
0
                        if (m_slice->m_sliceType == P_SLICE)
2408
0
                        {
2409
0
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2410
0
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
2411
2412
0
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2413
0
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
2414
0
                        }
2415
0
                        else
2416
0
                        {
2417
0
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2418
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2419
0
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
2420
0
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2421
2422
0
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2423
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2424
0
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
2425
0
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2426
0
                        }
2427
2428
0
                        bool bHor = false, bVer = false;
2429
0
                        if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
2430
0
                            bHor = true;
2431
0
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
2432
0
                            bVer = true;
2433
0
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
2434
0
                        {
2435
0
                            bHor = true;
2436
0
                            bVer = true;
2437
0
                        }
2438
2439
0
                        if (bHor)
2440
0
                        {
2441
0
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
2442
0
                            if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2443
0
                            {
2444
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
2445
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2446
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2447
0
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2448
0
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2449
#if ENABLE_SCC_EXT
2450
                                interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest;
2451
#endif
2452
0
                            }
2453
2454
0
                            if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
2455
0
                            {
2456
0
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
2457
0
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
2458
0
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
2459
0
                                checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
2460
0
                                checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
2461
#if ENABLE_SCC_EXT
2462
                                interBest = (md.pred[PRED_2NxnU].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnU] : interBest;
2463
#endif
2464
0
                            }
2465
2466
0
                            if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2467
0
                            {
2468
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
2469
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2470
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2471
0
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2472
0
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2473
#if ENABLE_SCC_EXT
2474
                                interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest;
2475
#endif
2476
0
                            }
2477
0
                        }
2478
2479
0
                        if (bVer)
2480
0
                        {
2481
0
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
2482
0
                            if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2483
0
                            {
2484
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
2485
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2486
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2487
0
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2488
0
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2489
#if ENABLE_SCC_EXT
2490
                                interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest;
2491
#endif
2492
0
                            }
2493
2494
0
                            if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
2495
0
                            {
2496
0
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
2497
0
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
2498
0
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2499
0
                                checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
2500
0
                                checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
2501
#if ENABLE_SCC_EXT
2502
                                interBest = (md.pred[PRED_nLx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nLx2N] : interBest;
2503
#endif
2504
0
                            }
2505
2506
0
                            if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2507
0
                            {
2508
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
2509
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2510
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2511
0
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2512
0
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2513
#if ENABLE_SCC_EXT
2514
                                interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest;
2515
#endif
2516
0
                            }
2517
0
                        }
2518
0
                    }
2519
0
                }
2520
2521
#if ENABLE_SCC_EXT
2522
                if (m_param->bEnableSCC)
2523
                {
2524
                    bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false, bValid;
2525
                    md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2526
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
2527
                    checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
2528
2529
                    if (intraBlockCopyFastSearch)
2530
                    {
2531
                        if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
2532
                        {
2533
                            md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2534
                            checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
2535
                            checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
2536
2537
                            md.pred[PRED_MIXED_IBC_NX2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2538
                            bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_NX2N], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_Nx2N, iMVCandList[SIZE_Nx2N]);
2539
                            if (bValid)
2540
                                encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_NX2N], cuGeom);
2541
                            else
2542
                                md.pred[PRED_MIXED_IBC_NX2N].rdCost = UINT64_MAX;
2543
                            checkBestMode(md.pred[PRED_MIXED_IBC_NX2N], depth);
2544
2545
                            md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2546
                            checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
2547
                            checkBestMode(md.pred[PRED_IBC_2NxN], depth);
2548
2549
                            md.pred[PRED_MIXED_IBC_2NXN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2550
                            bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_2NXN], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_2NxN, iMVCandList[SIZE_2NxN]);
2551
                            if (bValid)
2552
                                encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_2NXN], cuGeom);
2553
                            else
2554
                                md.pred[PRED_MIXED_IBC_2NXN].rdCost = UINT64_MAX;
2555
                            checkBestMode(md.pred[PRED_MIXED_IBC_2NXN], depth);
2556
                        }
2557
                    }
2558
                    else // full search
2559
                    {
2560
                        md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2561
                        checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
2562
                        checkBestMode(md.pred[PRED_IBC_2NxN], depth);
2563
2564
                        md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2565
                        checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
2566
                        checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
2567
                    }
2568
                }
2569
#endif
2570
2571
0
                if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
2572
0
                {
2573
0
                    if (!m_param->limitReferences || splitIntra)
2574
0
                    {
2575
0
                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
2576
0
                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
2577
0
                        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
2578
0
                        checkBestMode(md.pred[PRED_INTRA], depth);
2579
2580
0
                        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
2581
0
                        {
2582
0
                            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2583
0
                            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
2584
0
                            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
2585
0
                        }
2586
0
                    }
2587
0
                    else
2588
0
                    {
2589
0
                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
2590
0
                    }
2591
0
                }
2592
0
            }
2593
2594
#if ENABLE_SCC_EXT
2595
            // If Intra BC keep last coded Mv
2596
            if (md.bestMode->cu.isInter(0))
2597
            {
2598
                MVField mvField;
2599
                const CUData* cu = &md.bestMode->cu;
2600
                md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2601
                int iRefIdxFirst = mvField.refIdx;
2602
                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2603
                int iRefIdxLast = mvField.refIdx;
2604
                bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
2605
                bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
2606
2607
                if (isIntraBCFirst || isIntraBCLast)
2608
                {
2609
                    if (cu->m_partSize[0] == SIZE_2Nx2N)
2610
                    {
2611
                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2612
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
2613
                        {
2614
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2615
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2616
                        }
2617
                    }
2618
                    else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
2619
                    {
2620
                        // mixed PU, only one partition is IntraBC coded
2621
                        if (isIntraBCFirst != isIntraBCLast)
2622
                        {
2623
                            if (isIntraBCFirst)
2624
                            {
2625
                                // Part 0
2626
                                md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2627
                                if (mvField.mv != cu->m_lastIntraBCMv[0])
2628
                                {
2629
                                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2630
                                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2631
                                }
2632
                            }
2633
                            else if (isIntraBCLast)
2634
                            {
2635
                                // Part 1
2636
                                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2637
                                if (mvField.mv != cu->m_lastIntraBCMv[0])
2638
                                {
2639
                                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2640
                                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2641
                                }
2642
                            }
2643
                        }
2644
                        else // normal IntraBC CU
2645
                        {
2646
                            // Part 0
2647
                            md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2648
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2649
                            {
2650
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2651
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2652
                            }
2653
                            // Part 1
2654
                            md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2655
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2656
                            {
2657
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2658
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2659
                            }
2660
                        }
2661
                    }
2662
                    else
2663
                    {
2664
                        // NxN
2665
                        for (int part = 0; part < 4; part++)
2666
                        {
2667
                            md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
2668
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2669
                            {
2670
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2671
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2672
                            }
2673
                        }
2674
                    }
2675
                }
2676
            } // is inter
2677
#endif
2678
2679
0
            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
2680
0
            {
2681
0
                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
2682
2683
0
                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2684
0
                {
2685
0
                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
2686
0
                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
2687
0
                }
2688
0
                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
2689
0
            }
2690
0
            if (m_bTryLossless)
2691
0
                tryLossless(cuGeom);
2692
2693
0
            if (mightSplit)
2694
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
2695
0
        }
2696
2697
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2698
0
        {
2699
0
            if (mightNotSplit)
2700
0
            {
2701
0
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
2702
0
                int8_t maxTUDepth = -1;
2703
0
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
2704
0
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
2705
0
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
2706
0
            }
2707
0
        }
2708
2709
        /* compare split RD cost against best cost */
2710
0
        if (mightSplit && !skipRecursion)
2711
0
            checkBestMode(md.pred[PRED_SPLIT], depth);
2712
2713
0
        if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
2714
0
        {
2715
0
            int cuIdx = (cuGeom.childOffset - 1) / 3;
2716
0
            cacheCost[cuIdx] = md.bestMode->rdCost;
2717
0
        }
2718
2719
        /* determine which motion references the parent CU should search */
2720
0
        splitCUData.initSplitCUData();
2721
0
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2722
0
        {
2723
0
            if (md.bestMode == &md.pred[PRED_SPLIT])
2724
0
                splitCUData.splitRefs = allSplitRefs;
2725
0
            else
2726
0
            {
2727
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2728
0
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : (m_param->bEnableSCC ? interBest->cu : md.bestMode->cu);
2729
0
                uint32_t numPU = cu.getNumPartInter(0);
2730
0
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2731
0
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2732
0
            }
2733
0
        }
2734
2735
0
        if (m_param->limitModes)
2736
0
        {
2737
0
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2738
0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2739
0
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2740
0
        }
2741
2742
        /* Copy best data to encData CTU and recon */
2743
0
        md.bestMode->cu.copyToPic(depth);
2744
0
        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
2745
0
            md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx);
2746
0
    }
2747
0
    else
2748
0
    {
2749
0
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
2750
0
        {
2751
0
            qprdRefine(parentCTU, cuGeom, qp, qp);
2752
2753
0
            SplitData splitData[4];
2754
0
            splitData[0].initSplitCUData();
2755
0
            splitData[1].initSplitCUData();
2756
0
            splitData[2].initSplitCUData();
2757
0
            splitData[3].initSplitCUData();
2758
2759
0
            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2760
2761
0
            splitCUData.initSplitCUData();
2762
0
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2763
0
            {
2764
0
                if (md.bestMode == &md.pred[PRED_SPLIT])
2765
0
                    splitCUData.splitRefs = allSplitRefs;
2766
0
                else
2767
0
                {
2768
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2769
0
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2770
0
                    uint32_t numPU = cu.getNumPartInter(0);
2771
0
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2772
0
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2773
0
                }
2774
0
            }
2775
2776
0
            if (m_param->limitModes)
2777
0
            {
2778
0
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2779
0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2780
0
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2781
0
            }
2782
0
        }
2783
0
    }
2784
2785
0
    return splitCUData;
2786
0
}
2787
2788
void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
2789
0
{
2790
0
    uint32_t depth = cuGeom.depth;
2791
0
    ModeDepth& md = m_modeDepth[depth];
2792
0
    md.bestMode = NULL;
2793
2794
0
    m_evaluateInter = 0;
2795
0
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2796
0
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2797
0
    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2798
0
    int split = 0;
2799
2800
0
    TrainingData td;
2801
0
    td.init(parentCTU, cuGeom);
2802
2803
0
    if (!m_param->bDynamicRefine)
2804
0
        m_refineLevel = m_param->interRefine;
2805
0
    else
2806
0
        m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
2807
2808
0
    if (m_param->interRefine == 1)
2809
0
        split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit ||
2810
0
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2811
0
    else
2812
0
        split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
2813
0
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2814
0
    td.split = split;
2815
2816
0
    if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4))
2817
0
    {
2818
0
        setLambdaFromQP(parentCTU, qp, lqp);
2819
2820
0
        Mode& mode = md.pred[0];
2821
0
        md.bestMode = &mode;
2822
0
        mode.cu.initSubCU(parentCTU, cuGeom, qp);
2823
0
        PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
2824
0
        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2825
0
        {
2826
0
            if (m_param->intraRefine == 4)
2827
0
                compressIntraCU(parentCTU, cuGeom, qp);
2828
0
            else
2829
0
            {
2830
0
                bool reuseModes = !((m_param->intraRefine == 3) ||
2831
0
                    (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
2832
0
                if (reuseModes)
2833
0
                {
2834
0
                    memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2835
0
                    memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
2836
0
                }
2837
0
                checkIntra(mode, cuGeom, size);
2838
0
            }
2839
0
        }
2840
0
        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2841
0
        {
2842
0
            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
2843
0
            uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
2844
0
            for (uint32_t part = 0; part < numPU; part++)
2845
0
            {
2846
0
                PredictionUnit pu(mode.cu, cuGeom, part);
2847
0
                if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7))
2848
0
                {
2849
0
                    x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
2850
0
                    int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
2851
0
                    mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
2852
0
                    mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
2853
0
                    for (int list = 0; list < m_slice->isInterB() + 1; list++)
2854
0
                    {
2855
0
                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
2856
0
                        mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
2857
0
                        mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
2858
0
                    }
2859
0
                    if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
2860
0
                    {
2861
0
                        if (m_param->interRefine == 1)
2862
0
                            m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
2863
                        //AMVP
2864
0
                        MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2865
0
                        mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
2866
0
                        for (int list = 0; list < m_slice->isInterB() + 1; list++)
2867
0
                        {
2868
0
                            int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
2869
0
                            if (ref == -1)
2870
0
                                continue;
2871
0
                            MV mvp;
2872
2873
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2874
                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc, part, pu.puAbsPartIdx);
2875
#else
2876
0
                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
2877
0
#endif
2878
0
                            mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
2879
0
                            if (m_param->interRefine == 1)
2880
0
                            {
2881
0
                                MV outmv, mvpSelect[3];
2882
0
                                mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
2883
0
                                if (m_param->mvRefine > 1)
2884
0
                                {
2885
0
                                    mvpSelect[1] = mvp;
2886
0
                                    if(m_param->mvRefine > 2)
2887
0
                                        mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
2888
0
                                }
2889
0
                                searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
2890
0
                                mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
2891
0
                            }
2892
0
                            mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
2893
0
                        }
2894
0
                    }
2895
0
                    else
2896
0
                    {
2897
0
                        MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
2898
0
                        uint8_t candDir[MRG_MAX_NUM_CANDS];
2899
0
                        mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
2900
0
                        uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
2901
0
                        if (mode.cu.isBipredRestriction())
2902
0
                        {
2903
                            /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2904
0
                            if (candDir[mvpIdx] == 3)
2905
0
                            {
2906
0
                                candDir[mvpIdx] = 1;
2907
0
                                candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
2908
0
                            }
2909
0
                        }
2910
0
                        mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
2911
0
                        mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
2912
0
                        mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
2913
0
                        mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part);
2914
0
                        mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part);
2915
0
                    }
2916
0
                }
2917
0
                motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
2918
0
            }
2919
0
            if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
2920
0
                encodeResAndCalcRdSkipCU(mode);
2921
0
            else
2922
0
                encodeResAndCalcRdInterCU(mode, cuGeom);
2923
2924
            /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
2925
0
            bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0];
2926
0
            if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
2927
0
                checkDQP(mode, cuGeom);
2928
0
        }
2929
2930
0
        if (m_refineLevel < 2)
2931
0
        {
2932
0
            if (m_bTryLossless)
2933
0
                tryLossless(cuGeom);
2934
2935
0
            if (mightSplit)
2936
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
2937
2938
0
            if (mightSplit && m_param->rdLevel < 5)
2939
0
                checkDQPForSplitPred(*md.bestMode, cuGeom);
2940
0
        }
2941
2942
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2943
0
        {
2944
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
2945
0
            {
2946
0
                m_modeFlag[list] = true;
2947
0
                if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
2948
0
                    m_checkMergeAndSkipOnly[list] = true;
2949
0
            }
2950
0
            m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2951
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
2952
0
            {
2953
0
                m_modeFlag[list] = false;
2954
0
                m_checkMergeAndSkipOnly[list] = false;
2955
0
            }
2956
0
        }
2957
2958
0
        if (m_param->bDynamicRefine)
2959
0
            classifyCU(parentCTU,cuGeom, *md.bestMode, td);
2960
2961
0
        if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
2962
0
        {
2963
0
            if ((m_slice->m_origSliceType != I_SLICE))
2964
0
            {
2965
0
                if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit)
2966
0
                    m_evaluateInter = 1;
2967
0
                else
2968
0
                    bDecidedDepth = true;
2969
0
                m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2970
0
                m_evaluateInter = 0;
2971
0
            }
2972
0
            else
2973
0
            {
2974
0
                compressIntraCU(parentCTU, cuGeom, qp);
2975
0
            }
2976
0
        }
2977
0
    }
2978
0
    if (!bDecidedDepth || split)
2979
0
    {
2980
0
        Mode* splitPred = &md.pred[PRED_SPLIT];
2981
0
        if (!split)
2982
0
            md.bestMode = splitPred;
2983
0
        splitPred->initCosts();
2984
0
        CUData* splitCU = &splitPred->cu;
2985
0
        splitCU->initSubCU(parentCTU, cuGeom, qp);
2986
2987
0
        uint32_t nextDepth = depth + 1;
2988
0
        ModeDepth& nd = m_modeDepth[nextDepth];
2989
0
        invalidateContexts(nextDepth);
2990
0
        Entropy* nextContext = &m_rqt[depth].cur;
2991
0
        int nextQP = qp;
2992
2993
0
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2994
0
        {
2995
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2996
0
            if (childGeom.flags & CUGeom::PRESENT)
2997
0
            {
2998
0
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2999
0
                m_rqt[nextDepth].cur.load(*nextContext);
3000
3001
0
                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
3002
0
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
3003
3004
0
                int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp;
3005
3006
0
                if (split)
3007
0
                    m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
3008
0
                else
3009
0
                    qprdRefine(parentCTU, childGeom, nextQP, lamdaQP);
3010
3011
                // Save best CU and pred data for this sub CU
3012
0
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
3013
0
                splitPred->addSubCosts(*nd.bestMode);
3014
0
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
3015
0
                nextContext = &nd.bestMode->contexts;
3016
0
            }
3017
0
            else
3018
0
            {
3019
0
                splitCU->setEmptyPart(childGeom, subPartIdx);
3020
                // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
3021
0
                memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
3022
0
            }
3023
0
        }
3024
0
        nextContext->store(splitPred->contexts);
3025
0
        if (mightNotSplit)
3026
0
            addSplitFlagCost(*splitPred, cuGeom.depth);
3027
0
        else
3028
0
            updateModeCost(*splitPred);
3029
3030
0
        if (m_refineLevel)
3031
0
        {
3032
0
            if (m_param->rdLevel > 1)
3033
0
                checkBestMode(*splitPred, cuGeom.depth);
3034
0
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
3035
0
                md.bestMode = splitPred;
3036
0
        }
3037
3038
0
        checkDQPForSplitPred(*splitPred, cuGeom);
3039
3040
        /* Copy best data to encData CTU and recon */
3041
0
        md.bestMode->cu.copyToPic(depth);
3042
0
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
3043
0
    }
3044
0
    if (m_param->bDynamicRefine && bDecidedDepth)
3045
0
        trainCU(parentCTU, cuGeom, *md.bestMode, td);
3046
0
}
3047
3048
/* Assign a dynamic-refine level (m_refineLevel) to the current CU by comparing its
 * variance and RD cost against per-depth statistics gathered from previously
 * classified frames (m_classifyVariance / m_classifyRd / m_classifyCount).
 * Side effects: writes trainData.cuVariance and m_refineLevel. */
void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
    if (m_frame->m_classifyFrame)
    {
        uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
        uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
        float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
        uint8_t varRefineLevel = 1;
        uint8_t rdRefineLevel = 1;
        uint64_t cuCost = bestMode.rdCost;
        /* Statistics are stored flat: one X265_REFINE_INTER_LEVELS-wide group per depth */
        int offset = (depth * X265_REFINE_INTER_LEVELS);
        /* Cheaper than the level-1 class average => no extra refinement needed */
        if (cuCost < m_frame->m_classifyRd[offset])
            m_refineLevel = 1;
        else
        {
            /* Total sample count over all refine levels at this depth (normalizer for the priors) */
            uint64_t trainingCount = 0;
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                trainingCount += m_frame->m_classifyCount[offset];
            }
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                /* Calculate distance values.
                 * NOTE(review): operands are uint64_t, so the subtraction wraps before the
                 * (int64_t) cast; abs() then recovers the magnitude for in-range differences. */
                diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
                diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));

                /* Calculate prior probability - ranges between 0 and 1 */
                if (trainingCount)
                    probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);

                /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
                P(c|x) is the posterior probability of class given predictor.
                P(c) is the prior probability of class.
                P(x|c) is the likelihood which is the probability of predictor given class.
                P(x) is the prior probability of predictor.
                The cross-multiplied comparison below avoids dividing by the distances. */
                int curRefineLevel = m_refineLevel - 1;
                if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
                    varRefineLevel = i + 1;
                if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
                    rdRefineLevel = i + 1;
            }
            /* Be conservative: take the stronger of the two independent classifications */
            m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
        }
    }
}
3097
3098
/* Accumulate per-CU training statistics (RD cost, variance, sample count) into the
 * CTU's collection arrays, bucketed by depth and by the classification decided for
 * this CU. On non-classify frames the class is derived from how the re-encode
 * compares to the saved analysis data; on classify frames it is m_refineLevel.
 * NOTE(review): ctu is passed const but its m_collectCU* members are written —
 * presumably these are pointer-backed buffers; confirm against CUData's layout. */
void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    int classify = 1;
    if (!m_frame->m_classifyFrame)
    {
        /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
                          and CUs that has split.
           classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
           classify = 3 : CUs encoded as any other mode. */

        /* Saved analysis decision matches the freshly encoded decision exactly */
        bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
            trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
            trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
        /* Deepest-level CU that the saved analysis wanted to split further */
        bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
        if (refineInter0 || refineInter1)
            classify = 1;
        else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
            classify = 2;
        else
            classify = 3;
    }
    else
        classify = m_refineLevel;
    uint64_t cuCost = bestMode.rdCost;
    /* Flat index: X265_REFINE_INTER_LEVELS buckets per depth, classify is 1-based */
    int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
    ctu.m_collectCURd[offset] += cuCost;
    ctu.m_collectCUVariance[offset] += trainData.cuVariance;
    ctu.m_collectCUCount[offset]++;
}
3128
3129
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
3130
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->initCosts();
    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->initCosts();
    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
    PredictionUnit pu(merge.cu, cuGeom, 0);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int sizeIdx = cuGeom.log2CUSize - 2;
    /* Only initialized (and only read) on the intra-refresh P-slice path below */
    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        /* Rightmost safe reference column in pixels; -3 accounts for interpolation filter reach */
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4; /* quarter-pel units */
    }
    for (uint32_t i = 0; i < numMergeCand; ++i)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: First row in slice can't negative
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y < m_sliceMinY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond bound since it is another slice area
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y > m_sliceMaxY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y > m_sliceMaxY)
                    continue;
            }

            /* Reference rows beyond the search range may not be reconstructed yet
             * when frame-parallelism is active */
            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond safe reference area
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        /* Chroma MC is done here only when chroma participates in the sa8d decision */
        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

        tempPred->sa8dBits = getTUBits(i, numMergeCand);
        tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            /* Ping-pong the two Mode objects so the winner's CU state/predYuv is retained */
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64; /* skip (no residual) is invalid in lossless coding */
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->sa8dBits = bestPred->sa8dBits;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        /* Final skip-vs-merge decision by full RD cost */
        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
    md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
    md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
    md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
    md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
    checkDQP(*md.bestMode, cuGeom);
}
3267
3268
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
3269
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL.
 * Unlike the rd0_4 variant, every surviving candidate is fully RD-evaluated:
 * with residual (merge) and, when it has coefficients, without (skip). */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    merge.initCosts();
    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.initCosts();
    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
#if ENABLE_SCC_EXT
    /* Screen-content coding: demote bi-pred candidates where restricted */
    restrictBipredMergeCand(&merge.cu, 0, candMvField, candDir, numMergeCand);
#endif

    PredictionUnit pu(merge.cu, cuGeom, 0);

    bool foundCbf0Merge = false;
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;

    /* Only initialized (and only read) on the intra-refresh P-slice path below */
    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        /* Rightmost safe reference column in pixels; -3 accounts for interpolation filter reach */
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4; /* quarter-pel units */
    }
    for (uint32_t i = 0; i < numMergeCand; i++)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: First row in slice can't negative
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y < m_sliceMinY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond bound since it is another slice area
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y > m_sliceMaxY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y > m_sliceMaxY)
                    continue;
            }

            /* Reference rows beyond the search range may not be reconstructed yet
             * when frame-parallelism is active */
            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full;
         * evaluate each distinct zero candidate only once */
        if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (candDir[i] == 3 &&
            !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
            !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond safe reference area
            continue;
#if ENABLE_SCC_EXT
        /* Skip candidates whose L0 reference is the current picture itself (IBC case) */
        if ((candDir[i] == 1 || candDir[i] == 3) && (m_slice->m_refPOCList[0][candMvField[i][0].refIdx] == m_slice->m_poc))
        {
            continue;
        }
#endif
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */

        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */

            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            /* once a zero-CBF merge is found, later candidates skip the residual pass */
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */

            if (swapped)
            {
                /* tempPred now points at the old best; re-stage candidate i's state on it */
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = candDir[i];
                tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
                tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
                tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
                tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
                tempPred->cu.setPredModeSubParts(MODE_INTER);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    /* rdCost still MAX_INT64 means every candidate was filtered out: leave bestMode unset */
    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
        bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
        checkDQP(*bestPred, cuGeom);
    }
}
3424
3425
#if ENABLE_SCC_EXT
3426
void Analysis::checkRDCostIntraBCMerge2Nx2N(Mode& mergeIBC, const CUGeom& cuGeom)
3427
{
3428
    mergeIBC.initCosts();
3429
    MVField  cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
3430
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
3431
    uint32_t numValidMergeCand = 0;
3432
    CUData cu = mergeIBC.cu;
3433
    PredictionUnit pu(mergeIBC.cu, cuGeom, 0);
3434
    mergeIBC.rdCost = MAX_INT64;
3435
    for (uint32_t ui = 0; ui < m_slice->m_maxNumMergeCand; ++ui)
3436
    {
3437
        interDirNeighbours[ui] = 0;
3438
    }
3439
    int8_t org_qp;
3440
    int xPos = cu.m_cuPelX;
3441
    int yPos = cu.m_cuPelY;
3442
    int width = 1 << cu.m_log2CUSize[0];
3443
    int height = 1 << cu.m_log2CUSize[0];
3444
    uint8_t depth = cu.m_cuDepth[0];
3445
    mergeIBC.cu.setPartSizeSubParts(SIZE_2Nx2N);
3446
    Mode tempPred = m_modeDepth[depth].pred[PRED_MERGE_IBC];
3447
3448
    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, interDirNeighbours);
3449
    cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
3450
    restrictBipredMergeCand(&cu, 0, cMvFieldNeighbours, interDirNeighbours, numValidMergeCand);
3451
3452
    for (uint8_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
3453
    {
3454
        if (interDirNeighbours[mergeCand] != 1)
3455
        {
3456
            continue;
3457
        }
3458
3459
        if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mergeCand][0].refIdx] != m_slice->m_poc)
3460
        {
3461
            continue;
3462
        }
3463
3464
        if (!isBlockVectorValid(xPos, yPos, width, height, &cu,
3465
            0, 0, (cMvFieldNeighbours[mergeCand][0].mv.x >> 2), (cMvFieldNeighbours[mergeCand][0].mv.y >> 2), m_param->maxCUSize))
3466
        {
3467
            continue;
3468
        }
3469
3470
        // set MC parameters
3471
        cu.setPredModeSubParts(MODE_INTER);
3472
        cu.setPartSizeSubParts(SIZE_2Nx2N);
3473
        cu.m_mergeFlag[0] = true;
3474
        cu.m_mvpIdx[0][0] = mergeCand;
3475
        cu.setPUInterDir(interDirNeighbours[mergeCand], 0, 0);
3476
        cu.setPUMv(0, cMvFieldNeighbours[mergeCand][0].mv, 0, 0);
3477
        cu.setPUMv(1, cMvFieldNeighbours[mergeCand][1].mv, 0, 0);
3478
        cu.setPURefIdx(0, (int8_t)cMvFieldNeighbours[mergeCand][0].refIdx, 0, 0);
3479
        cu.setPURefIdx(1, (int8_t)cMvFieldNeighbours[mergeCand][1].refIdx, 0, 0);
3480
        motionCompensation(cu, pu, mergeIBC.predYuv, true, m_csp != X265_CSP_I400);
3481
3482
        org_qp = cu.m_qp[0];
3483
        encodeResAndCalcRdInterCU(mergeIBC, cuGeom);
3484
        if (mergeIBC.rdCost < tempPred.rdCost)
3485
            std::swap(mergeIBC, tempPred);
3486
        cu.setQPSubParts(org_qp, 0, depth);
3487
    }
3488
    std::swap(tempPred, mergeIBC);
3489
    checkBestMode(mergeIBC, depth);
3490
    checkDQP(mergeIBC, cuGeom);
3491
}
3492
#endif
3493
3494
/* Evaluate a 2Nx2N/NxN/rect inter partition at rd-levels 0-4: run motion
 * search, then score the prediction with SA8D (no full RDO at these levels).
 * Results are stored in interMode (distortion, sa8dBits, sa8dCost).
 * When analysis reuse is active, previously saved per-PU reference indices
 * (and at multi-pass refine, MVs/MVP indices) seed the search; at save-side
 * reuse levels the chosen references are written back for later passes. */
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* analysis-load reuse: restore the per-PU reference index chosen on a
     * previous encode (level 10 has its own, fuller reuse path elsewhere) */
    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    /* multi-pass refine: seed ref/MV/MVP-idx per direction from first-pass stats */
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                /* m_reuseRef holds one plane of refs per prediction direction */
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }
    /* chroma prediction is only produced when chroma SA8D weighting is on and
     * both the encode and the source are not 4:0:0 */
    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);

    /* predInterSearch sets interMode.sa8dBits */
    const Yuv& fencYuv = *interMode.fencYuv;
    Yuv& predYuv = interMode.predYuv;
    int part = partitionFromLog2Size(cuGeom.log2CUSize);
    interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        /* add chroma distortion into the SA8D total */
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
    }
    interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);

    /* analysis-save reuse: record the chosen per-PU reference indices */
    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}
3558
3559
/* Evaluate an inter partition at rd-levels 5-6: run motion search, then do a
 * full residual encode + RD cost (encodeResAndCalcRdInterCU) instead of the
 * SA8D estimate used at lower rd-levels.  Reuse seeding/saving mirrors
 * checkInter_rd0_4.  With the SCC extension, iMVCandList carries intra-block-
 * copy MV candidates into predInterSearch. */
#if ENABLE_SCC_EXT
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2], MV* iMVCandList)
#else
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
#endif
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* analysis-load reuse: restore per-PU reference indices saved earlier */
    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    /* multi-pass refine: seed ref/MV/MVP-idx per direction from first-pass stats */
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                /* m_reuseRef holds one plane of refs per prediction direction */
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }

#if ENABLE_SCC_EXT
    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask, iMVCandList);
#else
    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
#endif

    /* predInterSearch sets interMode.sa8dBits, but this is ignored */
    encodeResAndCalcRdInterCU(interMode, cuGeom);

    /* analysis-save reuse: record the chosen per-PU reference indices */
    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}
3623
3624
#if ENABLE_SCC_EXT
3625
void Analysis::checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList)
3626
{
3627
    intraBCMode.initCosts();
3628
    intraBCMode.cu.setPartSizeSubParts(ePartSize);
3629
    intraBCMode.cu.setPredModeSubParts(MODE_INTER);
3630
    intraBCMode.cu.setLumaIntraDirSubParts(DC_IDX, 0, cuGeom.depth);
3631
    intraBCMode.cu.setChromIntraDirSubParts(DC_IDX, 0, cuGeom.depth);
3632
    for (int i = 0; i < 2; i++)
3633
        intraBCMode.cu.m_lastIntraBCMv[i] = ibc.m_lastIntraBCMv[i];
3634
3635
    bool bValid = predIntraBCSearch(intraBCMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, ePartSize, testOnlyPred, bUse1DSearchFor8x8, ibc);
3636
    if (bValid)
3637
        encodeResAndCalcRdInterCU(intraBCMode, cuGeom);
3638
    else
3639
        intraBCMode.rdCost = UINT64_MAX;
3640
3641
    if (bValid && (intraBCMode.cu.m_log2CUSize[0] <= 4) && (intraBCMode.cu.m_partSize[0] == SIZE_2NxN || intraBCMode.cu.m_partSize[0] == SIZE_Nx2N))
3642
    {
3643
        int dummyWidth, dummyHeight;
3644
        uint32_t partAddr = 0;
3645
        intraBCMode.cu.getPartIndexAndSize(1, partAddr, dummyWidth, dummyHeight);
3646
        iMVCandList[0] = intraBCMode.cu.m_mv[0][0];
3647
        iMVCandList[1] = intraBCMode.cu.m_mv[0][partAddr];
3648
    }
3649
}
3650
#endif
3651
3652
/* Estimate a bi-directional 2Nx2N candidate from the best unidirectional
 * L0/L1 results of inter2Nx2N.  Two hypotheses are scored with SA8D:
 *  1) the best L0 MV paired with the best L1 MV;
 *  2) the coincident (zero,zero) MV pair, if the zero MV lies in range.
 * The cheaper of the two is left in bidir2Nx2N (sa8dBits/sa8dCost and the
 * CU's motion fields/prediction).  If bi-prediction is restricted for this
 * CU or either unidir search failed, the mode is disqualified with MAX_INT64
 * costs. */
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

#if ENABLE_SCC_EXT
    /* SCC adds an 8x8-specific bipred restriction check on the unidir MVs/refs */
    if ((cu.is8x8BipredRestriction(inter2Nx2N.bestME[0][0].mv, inter2Nx2N.bestME[0][1].mv, inter2Nx2N.bestME[0][0].ref, inter2Nx2N.bestME[0][1].ref) ? (1 << cu.m_log2CUSize[0] == 8) : cu.isBipredRestriction()) || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#else
    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#endif
    {
        /* disqualify bidir from mode decision */
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV   mvzero(0, 0);
    int  partEnum = cuGeom.log2CUSize - 2;

    /* adopt the unidirectional winners as the bidir starting point */
    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0    = bestME[0].ref;
    MV  mvp0    = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1    = bestME[1].ref;
    MV  mvp1    = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0); /* 3 == both lists */
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    PredictionUnit pu(cu, cuGeom, 0);
    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

    int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        /* Add in chroma distortion */
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    /* bit cost: both unidir costs, with the list-selection bits corrected
     * from two single-list signals to one bidir signal */
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;  // convert to quarter-pel units
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* temporarily overwrite the CU MVs; restored below if zero loses */
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            motionCompensation(cu, pu, tmpPredYuv, true, true);
            zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);

        }
        else
        {
            /* luma-only shortcut: average the two co-located reference blocks
             * directly rather than running full motion compensation */
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
            primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }
        /* replace each MV's bit cost with the zero-MV bit cost */
        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
        mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            /* zero-MV bidir wins: commit its cost and motion fields */
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d) /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
        }
        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
3789
3790
/* Second pass for rd-level 0: after CTU analysis chose the coding tree using
 * source-pixel predictions, recurse to each coded leaf CU, transform/quantize
 * its residual and write real reconstructed pixels into the frame recon
 * picture (intra path re-derives its residual; inter path subtracts the
 * accumulated top-depth prediction). */
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    /* recurse until we reach the depth at which this region was actually coded */
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.absPartIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom, m_csp);

    PicYuv& reconPic = *m_frame->m_reconPic[0];

    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough
        
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        /* intra: transform/quant the luma residual, then redo the chroma
         * mode decision and residual now that recon neighbors exist */
        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            getBestIntraModeChroma(*bestMode, cuGeom);
            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
        }
    }
    else // if (cu.isInter(0))
    {
        ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough

        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);

        primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
                                      fencYuv.m_buf[0], predY,
                                      fencYuv.m_size, predYuv.m_size);

        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);
        }

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

        /* a merge 2Nx2N CU with no coded coefficients becomes a skip CU */
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        if (cu.m_cbf[0][0])
        {
            /* select the aligned add_ps kernel only if all pointers/strides allow it */
            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
            bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
            primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
                (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        }
        else
            /* no luma coefficients: recon is just the prediction */
            primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                           predY, predYuv.m_size);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
             pixel* predU = predYuv.getCbAddr(absPartIdx);
             pixel* predV = predYuv.getCrAddr(absPartIdx);
             if (cu.m_cbf[1][0])
             {
                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                 bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                     (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
             }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predU, predYuv.m_csize);

            if (cu.m_cbf[2][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                    (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predV, predYuv.m_csize);
        }
    }

    /* commit the updated CU data (mode/cbf/etc.) back into the frame */
    cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
}
3915
3916
/* Charge the cost of the CU split flag at this depth to `mode`.  How the
 * flag is priced depends on the rd-level: exact entropy bits at rd >= 3,
 * a single approximate bit at rd 2, and a single SAD-domain bit at rd <= 1. */
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel <= 1)
    {
        /* SA8D-based decision: fold one flag bit into the SAD cost */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else if (m_param->rdLevel >= 3)
    {
        /* full RDO: entropy-code the split flag (0 or 1) for an exact bit count */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        mode.totalBits += mode.contexts.getNumberOfWrittenBits();
        updateModeCost(mode);
    }
    else
    {
        /* rd-level 2: approximate the split flag as one bit */
        mode.totalBits++;
        updateModeCost(mode);
    }
}
3938
3939
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
3940
0
{
3941
    /* Do not attempt to code a block larger than the largest block in the
3942
     * co-located CTUs in L0 and L1 */
3943
0
    int currentQP = parentCTU.m_qp[0];
3944
0
    int previousQP = currentQP;
3945
0
    uint32_t minDepth0 = 4, minDepth1 = 4;
3946
0
    uint32_t sum = 0;
3947
0
    int numRefs = 0;
3948
0
    int refPresent = (!m_slice->m_param->bEnableSCC && m_slice->m_numRefIdx[0]) || ((!m_slice->m_param->bEnableSCC && (m_slice->m_numRefIdx[0] - 1)));
3949
0
    if (refPresent)
3950
0
    {
3951
0
        numRefs++;
3952
0
        const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3953
0
        previousQP = cu.m_qp[0];
3954
0
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
3955
0
            return 0;
3956
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3957
0
        {
3958
0
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3959
0
            minDepth0 = X265_MIN(d, minDepth0);
3960
0
            sum += d;
3961
0
        }
3962
0
    }
3963
0
    if (m_slice->m_numRefIdx[1])
3964
0
    {
3965
0
        numRefs++;
3966
0
        const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
3967
0
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
3968
0
            return 0;
3969
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
3970
0
        {
3971
0
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
3972
0
            minDepth1 = X265_MIN(d, minDepth1);
3973
0
            sum += d;
3974
0
        }
3975
0
    }
3976
0
    if (!numRefs)
3977
0
        return 0;
3978
3979
0
    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
3980
0
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
3981
3982
    /* allow block size growth if QP is raising or avg depth is
3983
     * less than 1.5 of min depth */
3984
0
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
3985
0
        minDepth -= 1;
3986
3987
0
    return minDepth;
3988
0
}
3989
3990
/* returns true if recursion should be stopped */
3991
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
3992
0
{
3993
    /* early exit when the RD cost of best mode at depth n is less than the sum
3994
     * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
3995
     * left, colocated) and avg cost of that CU at depth "n" with weightage for
3996
     * each quantity */
3997
3998
0
    uint32_t depth = cuGeom.depth;
3999
0
    FrameData& curEncData = *m_frame->m_encData;
4000
0
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
4001
0
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
4002
0
    uint64_t cuCount = cuStat.count[depth];
4003
4004
0
    uint64_t neighCost = 0, neighCount = 0;
4005
0
    const CUData* above = parentCTU.m_cuAbove;
4006
0
    if (above)
4007
0
    {
4008
0
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
4009
0
        neighCost += astat.avgCost[depth] * astat.count[depth];
4010
0
        neighCount += astat.count[depth];
4011
4012
0
        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
4013
0
        if (aboveLeft)
4014
0
        {
4015
0
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
4016
0
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
4017
0
            neighCount += lstat.count[depth];
4018
0
        }
4019
4020
0
        const CUData* aboveRight = parentCTU.m_cuAboveRight;
4021
0
        if (aboveRight)
4022
0
        {
4023
0
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
4024
0
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
4025
0
            neighCount += rstat.count[depth];
4026
0
        }
4027
0
    }
4028
0
    const CUData* left = parentCTU.m_cuLeft;
4029
0
    if (left)
4030
0
    {
4031
0
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
4032
0
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
4033
0
        neighCount += nstat.count[depth];
4034
0
    }
4035
4036
    // give 60% weight to all CU's and 40% weight to neighbour CU's
4037
0
    if (neighCount + cuCount)
4038
0
    {
4039
0
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
4040
0
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
4041
0
        if (curCost < avgCost && avgCost)
4042
0
            return true;
4043
0
    }
4044
4045
0
    return false;
4046
0
}
4047
4048
bool Analysis::complexityCheckCU(const Mode& bestMode)
4049
0
{
4050
0
    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
4051
0
    {
4052
0
        uint32_t mean = 0;
4053
0
        uint32_t homo = 0;
4054
0
        uint32_t cuSize = bestMode.fencYuv->m_size;
4055
0
        for (uint32_t y = 0; y < cuSize; y++) {
4056
0
            for (uint32_t x = 0; x < cuSize; x++) {
4057
0
                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
4058
0
            }
4059
0
        }
4060
0
        mean = mean / (cuSize * cuSize);
4061
0
        for (uint32_t y = 0; y < cuSize; y++) {
4062
0
            for (uint32_t x = 0; x < cuSize; x++) {
4063
0
                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
4064
0
            }
4065
0
        }
4066
0
        homo = homo / (cuSize * cuSize);
4067
4068
0
        if (homo < (.1 * mean))
4069
0
            return true;
4070
4071
0
        return false;
4072
0
    }
4073
0
    else
4074
0
    {
4075
0
        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
4076
0
        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
4077
0
        intptr_t stride = m_frame->m_fencPic->m_stride;
4078
0
        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
4079
0
        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
4080
0
        uint32_t sum = (uint32_t)sum_ss;
4081
0
        uint32_t ss = (uint32_t)(sum_ss >> 32);
4082
0
        uint32_t pixelCount = 1 << shift;
4083
0
        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
4084
4085
0
        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
4086
0
            return false;
4087
0
        else
4088
0
            return true;
4089
0
    }
4090
0
 }
4091
4092
uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
4093
0
{
4094
0
    uint32_t cuVariance = 0;
4095
0
    uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
4096
0
    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
4097
4098
0
    uint32_t width = m_frame->m_fencPic->m_picWidth;
4099
0
    uint32_t height = m_frame->m_fencPic->m_picHeight;
4100
0
    uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
4101
0
    uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
4102
0
    uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
4103
0
    uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
4104
0
    uint32_t cnt = 0; 
4105
4106
0
    for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
4107
0
    {
4108
0
        for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
4109
0
        {
4110
0
            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
4111
0
            cuVariance += blockVariance[idx];
4112
0
            cnt++;
4113
0
        }
4114
0
    }
4115
0
    return cuVariance / cnt;
4116
0
}
4117
4118
double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
4119
0
{
4120
0
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
4121
0
    PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
4122
4123
0
    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
4124
0
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;
4125
4126
0
    uint32_t aqStride = pQPLayer->numAQPartInWidth;
4127
4128
0
    double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
4129
0
    return dQpOffset;
4130
0
}
4131
4132
double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
4133
0
{
4134
0
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
4135
0
    PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
4136
4137
0
    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
4138
0
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;
4139
4140
0
    uint32_t aqStride = pcAQLayer->numAQPartInWidth;
4141
4142
0
    double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
4143
0
    return dQpOffset;
4144
0
}
4145
4146
/* Compute the final QP for the CU described by cuGeom.
 *
 * Starts from baseQp (or, when baseQp < 0, the CTU's rate-control base QP)
 * and applies whichever offsets are enabled: multi-pass distortion refine,
 * analysis-reuse per-CU offsets, and cuTree/AQ offsets (hevcAq or legacy
 * lowres-grid AQ).  When complexCheck is non-zero the function instead
 * returns 1/0 indicating whether the AQ offset is below the dynamic-RD
 * complexity threshold.  QP returns are clipped to [rc.qpMin, rc.qpMax]. */
int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
    FrameData& curEncData = *m_frame->m_encData;
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
    /* cuTree offsets are used only for referenced frames with cuTree enabled,
     * and never in complexity-check mode */
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;

    /* Multi-pass distortion refinement: shift QP for CTUs whose distortion
     * ratio deviates from ~1.0, provided both high- and low-distortion CTUs
     * exist in the frame */
    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && strlen(m_param->analysisLoad)))
    {
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
            qp += distortionData->offset[ctu.m_cuAddr];
    }

    /* Analysis reuse: apply the per-CU QP offset recorded by a prior encode
     * and return immediately */
    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
    {
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
        if (ctu.m_slice->m_sliceType == I_SLICE)
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
        else
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
    }
    if (m_param->rc.hevcAq)
    {
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double dQpOffset = 0;
        if (bCuTreeOffset)
        {
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
        }
        else
        {
            dQpOffset = aqQPOffset(ctu, cuGeom);
            if (complexCheck)
            {
                /* Complexity check: compare the AQ offset against the
                 * dynamic-RD threshold (both scaled by 100 and rounded) and
                 * return the boolean result instead of a QP */
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
        qp += dQpOffset;
    }
    else
    {
        /* Legacy AQ: average the lowres per-block offsets over the area this
         * CU covers.  The lowres offset grid has one entry per loopIncr x
         * loopIncr block (8 or 16 depending on the QG size). */
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
        if (qpoffs)
        {
            uint32_t width = m_frame->m_fencPic->m_picWidth;
            uint32_t height = m_frame->m_fencPic->m_picHeight;
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
            double dQpOffset = 0;
            uint32_t cnt = 0;
            /* Walk the offset grid in loopIncr steps, clipped to the picture
             * boundary so partial CUs at the right/bottom edges are handled */
            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
            {
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
                {
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                    dQpOffset += qpoffs[idx];
                    cnt++;
                }
            }
            dQpOffset /= cnt;
            qp += dQpOffset;
            if (complexCheck)
            {
                /* Same threshold comparison as the hevcAq path above */
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
    }

    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}
4227
4228
void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
4229
0
{
4230
0
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
4231
0
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
4232
0
    int shift = (X265_DEPTH - 8);
4233
4234
0
    double s = 1 + 0.005 * qp;
4235
4236
    // Calculate denominator of normalization factor
4237
0
    uint64_t fDc_den = 0, fAc_den = 0;
4238
4239
    // 1. Calculate dc component
4240
0
    uint64_t z_o = 0;
4241
0
    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
4242
0
    {
4243
0
        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
4244
0
        {
4245
0
            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
4246
0
            z_o += temp * temp; // 2 * (Z(0)) pow(2)
4247
0
        }
4248
0
    }
4249
0
    fDc_den = (2 * z_o)  + (blockSize * blockSize * ssim_c1); // 2 * (Z(0)) pow(2) + N * C1
4250
0
    fDc_den /= ((blockSize >> 2) * (blockSize >> 2));
4251
4252
    // 2. Calculate ac component
4253
0
    uint64_t z_k = 0;
4254
0
    int block = (int)(((log(blockSize) / log(2)) - 2) + 0.5);
4255
0
    primitives.cu[block].normFact(src, blockSize, shift, &z_k);
4256
4257
    // Remove the DC part
4258
0
    z_k -= z_o;
4259
4260
0
    fAc_den = z_k + int(s * z_k) + ssim_c2;
4261
0
    fAc_den /= ((blockSize >> 2) * (blockSize >> 2));
4262
4263
0
    ctu.m_fAc_den[ttype] = fAc_den;
4264
0
    ctu.m_fDc_den[ttype] = fDc_den;
4265
0
}
4266
4267
/* Compute SSIM-RD normalization factors for the CTU: always for luma, and
 * for both chroma planes unless the encode or the source is monochrome. */
void Analysis::calculateNormFactor(CUData& ctu, int qp)
{
    uint32_t lumaSize = m_modeDepth[0].fencYuv.m_size;
    const pixel* lumaPlane = m_modeDepth[0].fencYuv.m_buf[0];

    normFactor(lumaPlane, lumaSize, ctu, qp, TEXT_LUMA);

    /* Skip chroma when either the configured csp or the source picture is 4:0:0 */
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        uint32_t chromaSize = m_modeDepth[0].fencYuv.m_csize;
        const pixel* cbPlane = m_modeDepth[0].fencYuv.m_buf[1];
        const pixel* crPlane = m_modeDepth[0].fencYuv.m_buf[2];

        normFactor(cbPlane, chromaSize, ctu, qp, TEXT_CHROMA_U);
        normFactor(crPlane, chromaSize, ctu, qp, TEXT_CHROMA_V);
    }
}
4284
4285
int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
4286
0
{
4287
0
    int sameContentRef = 0;
4288
0
    int m_curPoc = parentCTU.m_slice->m_poc;
4289
0
    int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
4290
0
    int numPredDir = m_slice->isInterP() ? 1 : 2;
4291
0
    for (int list = 0; list < numPredDir; list++)
4292
0
    {
4293
0
        for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
4294
0
        {
4295
0
            int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
4296
#if ENABLE_SCC_EXT
4297
            if (refPoc == m_curPoc)
4298
                continue;
4299
#endif
4300
0
            int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
4301
0
            if ((refPoc < prevChange && refPoc < m_curPoc) || (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
4302
0
                sameContentRef++;    /* Content changed */
4303
0
        }
4304
0
    }
4305
0
    return sameContentRef;
4306
0
}