Coverage Report

Created: 2022-08-24 06:17

/src/x265/source/encoder/analysis.cpp
Execution count for every instrumented line in this section: 0 (the file is entirely uncovered).
/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
 *          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "threading.h"

#include "analysis.h"
#include "rdcost.h"
#include "encoder.h"

using namespace X265_NS;

/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *   RDO split decisions
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   sa8d decisions include chroma residual cost
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *   chroma residual cost included in satd decisions, including subpel refine
 *    (as a result of --subme 3 being used by preset slow)
 *
 * rd-level 5,6 does RDO for each inter mode
 */
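
/* A rough sketch of how these levels map onto the analysis entry points used
 * later in this file (see compressCTU); illustrative only, assuming a non-I
 * slice without analysis reuse:
 *
 *   rd-level 0    -> compressInterCU_rd0_4() then encodeResidue()
 *   rd-level 1-4  -> compressInterCU_rd0_4()
 *                    (or compressInterCU_dist() with --pmode at rd >= 2)
 *   rd-level 5,6  -> compressInterCU_rd5_6()
 */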

Analysis::Analysis()
{
    m_reuseInterDataCTU = NULL;
    m_reuseRef = NULL;
    m_bHD = false;
    m_modeFlag[0] = false;
    m_modeFlag[1] = false;
    m_checkMergeAndSkipOnly[0] = false;
    m_checkMergeAndSkipOnly[1] = false;
    m_evaluateInter = 0;
}

bool Analysis::create(ThreadLocalData *tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;

    int costArrSize = 1;
    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
    for (uint32_t i = 1; i <= maxDQPDepth; i++)
        costArrSize += (1 << (i * 2));
    cacheCost = X265_MALLOC(uint64_t, costArrSize);
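    /* cacheCost holds one best-mode RD cost per CU from the CTU root down to
     * the quant-group depth: 4^i entries at each depth i in [0, maxDQPDepth].
     * For example, a 64x64 CTU with a 16x16 quant-group gives maxDQPDepth = 2
     * and 1 + 4 + 16 = 21 entries. */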

    int csp = m_param->internalCsp;
    uint32_t cuSize = m_param->maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth &md = m_modeDepth[depth];
        ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
        ok &= md.fencYuv.create(cuSize, csp);
        if (ok)
        {
            for (int j = 0; j < MAX_PRED_TYPES; j++)
            {
                md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
                ok &= md.pred[j].predYuv.create(cuSize, csp);
                ok &= md.pred[j].reconYuv.create(cuSize, csp);
                md.pred[j].fencYuv = &md.fencYuv;
            }
        }
    }
    if (m_param->sourceHeight >= 1080)
        m_bHD = true;

    return ok;
}

void Analysis::destroy()
{
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
    X265_FREE(cacheCost);
}

Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;
    m_bChromaSa8d = m_param->rdLevel >= 3;
    m_param = m_frame->m_param;

#if _DEBUG || CHECKED_BUILD
    invalidateContexts(0);
#endif

    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
    ctu.setQPSubParts((int8_t)qp, 0, 0);

    m_rqt[0].cur.load(initialContext);
    ctu.m_meanQP = initialContext.m_meanQP;
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    if (m_param->bSsimRd)
        calculateNormFactor(ctu, qp);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr))
    {
        x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr;
        int32_t depthIdx = 0;
        uint32_t maxNum8x8Partitions = 64;
        uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
        uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        do
        {
            uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
            uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
            int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
            memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
            memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
            for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
                prevCtuInfoChangePtr[l] = prevCtuInfoChange;
            depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
            contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
            prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
            depthIdx++;
        } while (ctuTemp->ctuPartitions[depthIdx] != 0);

        m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
        m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
        memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
        // Calculate log2CUSize from depth
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
    }
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
    {
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        for (int dir = 0; dir < numPredDir; dir++)
        {
            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
        }
        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
    }

    int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
    if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
    {
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
        if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
            ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
        if (reuseLevel > 4)
        {
            m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
            m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
        }
        if (m_param->analysisSave && !m_param->analysisLoad)
            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
                m_reuseRef[i] = -1;
    }
    ProfileCUScope(ctu, totalCTUTime, totalCTUs);

    if (m_slice->m_sliceType == I_SLICE)
    {
        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
        if (m_param->analysisLoadReuseLevel > 1)
        {
            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
        }
        compressIntraCU(ctu, cuGeom, qp);
    }
    else
    {
        bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
        bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
        bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
        bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;

        if (bCopyAnalysis)
        {
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);

            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            // Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
        }

        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
            && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
            compressIntraCU(ctu, cuGeom, qp);
        else if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom, qp);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
                ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
        {
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
            int posCTU = ctu.m_cuAddr * numPartition;
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
            {
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
            }
            // Calculate log2CUSize from depth
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];

            qprdRefine(ctu, cuGeom, qp, qp);
            return *m_modeDepth[0].bestMode;
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom, qp);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom, qp);
        else
            compressInterCU_rd5_6(ctu, cuGeom, qp);
    }

    if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
        qprdRefine(ctu, cuGeom, qp, qp);

    if (m_param->csvLogLevel >= 2)
        collectPUStatistics(ctu, cuGeom);

    return *m_modeDepth[0].bestMode;
}

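/* Gather per-frame PU statistics for the CSV log. Each PU contributes a
 * weight of 1 << shift, i.e. 4^(maxCUDepth + 1 - depth), proportional to the
 * CU's area, so counts are comparable across depths; rectangular partitions
 * split the weight between their two PUs, while an AMP CU is counted once as
 * a whole. */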
void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
{
    uint8_t depth = 0;
    uint8_t partSize = 0;
    for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
    {
        depth = ctu.m_cuDepth[absPartIdx];
        partSize = ctu.m_partSize[absPartIdx];
        uint32_t numPU = nbPartsTable[(int)partSize];
        int shift = 2 * (m_param->maxCUDepth + 1 - depth);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            PredictionUnit pu(ctu, cuGeom, puIdx);
            int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
            int mode = 1;
            if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
                mode = 2;
            else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
                mode = 3;
            if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
            {
                ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
            }
            else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
            {
                if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
                {
                    ctu.m_encData->m_frameStats.cnt4x4++;
                    ctu.m_encData->m_frameStats.totalPu[4]++;
                }
                else
                {
                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
                    ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                }
            }
            else if (mode == 3)
            {
                ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                break;
            }
            else
            {
                if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
                    ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
                else
                    ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;

                ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
            }
        }
    }
}

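/* Predict a TU depth limit for this CU position (used by --limit-tu with
 * X265_TU_LIMIT_NEIGH) by averaging the m_refTuDepth recorded for the
 * co-located CU in the L0 (and, for B slices, L1) reference frame and the
 * available neighbouring CTUs, then quantizing the average; -1 means no
 * limit is applied. */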
int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
{
    float predDepth = 0;
    CUData* neighbourCU;
    uint8_t count = 0;
    int32_t maxTUDepth = -1;
    neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
    count++;
    if (m_slice->isInterB())
    {
        neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
        count++;
    }
    if (parentCTU.m_cuAbove)
    {
        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
        count++;
        if (parentCTU.m_cuAboveLeft)
        {
            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
            count++;
        }
        if (parentCTU.m_cuAboveRight)
        {
            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
            count++;
        }
    }
    if (parentCTU.m_cuLeft)
    {
        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
        count++;
    }
    predDepth /= count;

    if (predDepth == 0)
        maxTUDepth = 0;
    else if (predDepth < 1)
        maxTUDepth = 1;
    else if (predDepth >= 1 && predDepth <= 1.5)
        maxTUDepth = 2;
    else if (predDepth > 1.5 && predDepth <= 2.5)
        maxTUDepth = 3;
    else
        maxTUDepth = -1;

    return maxTUDepth;
}

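/* Re-check the current best mode with transquant bypass (--cu-lossless): a
 * PRED_LOSSLESS candidate is initialized from the best mode, re-coded, and
 * adopted only if it wins the RD comparison. A best mode that already has
 * zero distortion is lossless as-is and is left untouched. */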
void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
        /* already lossless */
        return;
    else if (md.bestMode->cu.isIntra(0))
    {
        md.pred[PRED_LOSSLESS].initCosts();
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].initCosts();
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}

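/* QP refinement (--rd-refine / --opt-cu-delta-qp): starting from the chosen
 * QP, step outward in each permitted direction, re-coding the CU at every
 * candidate QP and keeping the cheapest; a direction is abandoned once the
 * cost fails to improve more than `threshold` consecutive times. */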
void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;

    int bestCUQP = qp;
    int lambdaQP = lqp;
    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
    if (m_param->analysisLoadReuseLevel >= 7)
        doQPRefine = false;
    if (doQPRefine)
    {
        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;

        int cuIdx = (cuGeom.childOffset - 1) / 3;
        bestCUCost = origCUCost = cacheCost[cuIdx];

        int direction = m_param->bOptCUDeltaQP ? 1 : 2;

        for (int dir = direction; dir >= -direction; dir -= (direction * 2))
        {
            if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
                break;

            int threshold = 1;
            int failure = 0;
            cuPrevCost = origCUCost;

            int modCUQP = qp + dir;
            while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
            {
                if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
                    break;

                recodeCU(parentCTU, cuGeom, modCUQP, qp);
                cuCost = md.bestMode->rdCost;

                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
                if (cuCost < cuPrevCost)
                    failure = 0;
                else
                    failure++;

                if (failure > threshold)
                    break;

                cuPrevCost = cuCost;
                modCUQP += dir;
            }
        }
        lambdaQP = bestCUQP;
    }

    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
}

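/* Recursive intra analysis. Returns the best RD cost for this CU so that,
 * with --splitrd-skip, a parent can stop evaluating the remaining sub-CUs
 * once their accumulated cost already exceeds its best whole-CU mode. */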
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;
    if (m_param->intraRefine && m_param->intraRefine != 4)
    {
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
            bAlreadyDecided = false;
    }

    if (bAlreadyDecided)
    {
        if (bDecidedDepth && mightNotSplit)
        {
            Mode& mode = md.pred[0];
            md.bestMode = &mode;
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            bool reuseModes = !((m_param->intraRefine == 3) ||
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
            if (reuseModes)
            {
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
            }
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }
    }
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
    {
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
        checkBestMode(md.pred[PRED_INTRA], depth);

        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // stop recursion if we reach the depth of previous analysis decision
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;
        uint64_t curCost = 0;
        int skipSplitCheck = 0;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                if (m_param->bEnableSplitRdSkip)
                {
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
                    {
                        skipSplitCheck = 1;
                        break;
                    }
                }
                else
                    compressIntraCU(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);

                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
                if (bAlreadyDecided)
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
            }
        }
        if (!skipSplitCheck)
        {
            nextContext->store(splitPred->contexts);
            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else
                updateModeCost(*splitPred);

            checkDQPForSplitPred(*splitPred, cuGeom);
            checkBestMode(*splitPred, depth);
        }
    }

    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        int cuIdx = (cuGeom.childOffset - 1) / 3;
        cacheCost[cuIdx] = md.bestMode->rdCost;
    }

    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
    {
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
        int8_t maxTUDepth = -1;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);

    return md.bestMode->rdCost;
}

void Analysis::PMODE::processTasks(int workerThreadId)
{
#if DETAILED_CU_STATS
    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
    master.m_stats[fe].countPModeTasks++;
    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
#endif
    ProfileScopeEvent(pmode);
    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
}

/* process pmode jobs until none remain; may be called by the master thread or by
 * a bonded peer (slave) thread via pmodeTasks() */
void Analysis::processPmode(PMODE& pmode, Analysis& slave)
{
    /* acquire a mode task, else exit early */
    int task;
    pmode.m_lock.acquire();
    if (pmode.m_jobTotal > pmode.m_jobAcquired)
    {
        task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    else
    {
        pmode.m_lock.release();
        return;
    }

    ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];

    /* setup slave Analysis */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
        slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
        slave.invalidateContexts(0);
        slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
    }

    /* perform Mode task, repeat until no more work is available */
    do
    {
        uint32_t refMasks[2] = { 0, 0 };

        if (m_param->rdLevel <= 4)
        {
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                if (m_param->rdLevel > 2)
                    slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                if (m_slice->m_sliceType == B_SLICE)
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */

                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }
        else
        {
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
                if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                md.pred[PRED_BIDIR].rdCost = MAX_INT64;
                if (m_slice->m_sliceType == B_SLICE)
                {
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                        slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
                }
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }

        task = -1;
        pmode.m_lock.acquire();
        if (pmode.m_jobTotal > pmode.m_jobAcquired)
            task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    while (task >= 0);
}

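/* Distributed mode analysis (--pmode): the master enqueues one job per
 * candidate partition type, offers them to bonded worker threads, then
 * participates in draining the queue itself before waiting for the workers
 * to finish. Returns the bitmask of motion references used by the best
 * mode, so parents can restrict their search when --limit-refs includes
 * depth-based limiting. */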
uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    PMODE pmode(*this, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);

        if (m_param->rdLevel <= 4)
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        else
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
    }

    bool bNoSplit = false;
    bool splitIntra = true;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;
        splitIntra = false;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);

                // Save best CU and pred data for this sub CU
                splitIntra |= nd.bestMode->cu.isIntra(0);
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkDQPForSplitPred(*splitPred, cuGeom);
    }

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);

        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
            setLambdaFromQP(parentCTU, qp);

        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
        }
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
        }

        m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];

        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);

        /* participate in processing jobs, until all are distributed */
        processPmode(pmode, *this);

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                    {
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            {
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
                pmode.waitForExit();
            }

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* compare split RD cost against best cost */
    if (mightSplit && !bNoSplit)
        checkBestMode(md.pred[PRED_SPLIT], depth);

    /* determine which motion references the parent CU should search */
    uint32_t refMask;
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
        refMask = 0;
    else if (md.bestMode == &md.pred[PRED_SPLIT])
        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    else
    {
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
        uint32_t numPU = cu.getNumPartInter(0);
        refMask = 0;
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
            refMask |= cu.getBestRefIdx(subPartIdx);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);

    return refMask;
}

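/* Inter analysis for rd-levels 0-4. CTUs affected by a VBV re-encode are
 * redirected to the rd 5/6 path, and SEA motion search first wires up the
 * integral-plane pointers for each reference. When AVC analysis data is
 * being reused, the three gating booleans below appear to decide whether the
 * block is analysed here at all: blocks larger than an AVC 16x16, refinement
 * of AVC decisions at reuse level 7, or no offloading configured. */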
SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1147
0
{
1148
0
    if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
1149
0
        return compressInterCU_rd5_6(parentCTU, cuGeom, qp);
1150
1151
0
    uint32_t depth = cuGeom.depth;
1152
0
    uint32_t cuAddr = parentCTU.m_cuAddr;
1153
0
    ModeDepth& md = m_modeDepth[depth];
1154
1155
1156
0
    if (m_param->searchMethod == X265_SEA)
1157
0
    {
1158
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
1159
0
        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1160
0
        for (int list = 0; list < numPredDir; list++)
1161
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1162
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1163
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1164
0
    }
1165
1166
0
    PicYuv& reconPic = *m_frame->m_reconPic;
1167
0
    SplitData splitCUData;
1168
1169
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1170
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1171
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1172
1173
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1174
0
    {
1175
0
        md.bestMode = NULL;
1176
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1177
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1178
0
        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
1179
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1180
0
        bool skipModes = false; /* Skip any remaining mode analyses at current depth */
1181
0
        bool skipRecursion = false; /* Skip recursion */
1182
0
        bool splitIntra = true;
1183
0
        bool skipRectAmp = false;
1184
0
        bool chooseMerge = false;
1185
0
        bool bCtuInfoCheck = false;
1186
0
        int sameContentRef = 0;
1187
1188
0
        if (m_evaluateInter)
1189
0
        {
1190
0
            if (m_refineLevel == 2)
1191
0
            {
1192
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1193
0
                    skipModes = true;
1194
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1195
0
                    skipRectAmp = true;
1196
0
            }
1197
0
            mightSplit &= false;
1198
0
            minDepth = depth;
1199
0
        }
1200
1201
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1202
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1203
1204
0
        SplitData splitData[4];
1205
0
        splitData[0].initSplitCUData();
1206
0
        splitData[1].initSplitCUData();
1207
0
        splitData[2].initSplitCUData();
1208
0
        splitData[3].initSplitCUData();
1209
1210
        // avoid uninitialized values in the reference below
1211
0
        if (m_param->limitModes)
1212
0
        {
1213
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1214
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1215
0
            md.pred[PRED_2Nx2N].sa8dCost = 0;
1216
0
        }
1217
1218
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1219
0
        {
1220
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1221
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1222
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1223
0
            {
1224
0
                mightNotSplit &= bDecidedDepth;
1225
0
                bCtuInfoCheck = skipRecursion = false;
1226
0
                skipModes = true;
1227
0
            }
1228
0
            else if (mightNotSplit && bDecidedDepth)
1229
0
            {
1230
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
1231
0
                {
1232
0
                    bCtuInfoCheck = skipRecursion = true;
1233
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1234
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1235
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1236
0
                    if (!sameContentRef)
1237
0
                    {
1238
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1239
0
                        {
1240
0
                            qp -= int32_t(0.04 * qp);
1241
0
                            setLambdaFromQP(parentCTU, qp);
1242
0
                        }
1243
0
                        if (m_param->bCTUInfo & 4)
1244
0
                            skipModes = false;
1245
0
                    }
1246
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1247
0
                    {
1248
0
                        if (m_param->rdLevel)
1249
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1250
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
1251
0
                            skipModes = md.bestMode && true;
1252
0
                    }
1253
0
                }
1254
0
                else
1255
0
                {
1256
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1257
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1258
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1259
0
                    if (m_param->rdLevel)
1260
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1261
0
                }
1262
0
                mightSplit &= !bDecidedDepth;
1263
0
            }
1264
0
        }
1265
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
1266
0
        {
1267
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1268
0
            {
1269
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1270
0
                {
1271
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1272
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1273
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1274
1275
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1276
0
                    if (m_param->rdLevel)
1277
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1278
0
                }
1279
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1280
0
                {
1281
0
                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
1282
0
                    {
1283
0
                        skipRectAmp = true && !!md.bestMode;
1284
0
                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
1285
0
                    }
1286
0
                }
1287
0
            }
1288
0
        }
1289
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1290
0
        {
1291
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1292
0
            {
1293
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1294
0
                {
1295
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1296
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1297
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1298
1299
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1300
0
                    if (m_param->rdLevel)
1301
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1302
0
                }
1303
0
            }
1304
0
        }
1305
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
1306
0
        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1307
            /* TODO: Re-evaluate if analysis load/save still works */
1308
0
        {
1309
            /* Compute Merge Cost */
1310
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1311
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1312
0
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1313
0
            if (m_param->rdLevel)
1314
0
                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
1315
0
                && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
1316
0
        }
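        /* Early-skip note: when the merge search above already produced a true
         * skip (2Nx2N merge with no residual) and early-skip is enabled, the
         * remaining mode decisions at this depth are bypassed. */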
1317
0
        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1318
0
        {
1319
0
            skipRecursion = md.bestMode->cu.isSkipped(0);
1320
0
            if (mightSplit && !skipRecursion)
1321
0
            {
1322
0
                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
1323
0
                {
1324
0
                    if (depth)
1325
0
                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1326
0
                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
1327
0
                        skipRecursion = complexityCheckCU(*md.bestMode);
1328
0
                }
1329
0
                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
1330
0
                {
1331
0
                    skipRecursion = complexityCheckCU(*md.bestMode);
1332
0
                }
1333
1334
0
            }
1335
0
        }
1336
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
1337
0
            skipRecursion = true;
1338
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1339
0
        if (mightSplit && !skipRecursion)
1340
0
        {
1341
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
1342
0
                qp = int((1 / 0.96) * qp + 0.5);
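                /* (1 / 0.96) approximately undoes the ~4% QP reduction applied
                 * in the CTU-info branch above (qp -= int32_t(0.04 * qp)),
                 * restoring the original QP, rounded to nearest, before the
                 * sub-CUs are analysed */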
1343
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
1344
0
            splitPred->initCosts();
1345
0
            CUData* splitCU = &splitPred->cu;
1346
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
1347
1348
0
            uint32_t nextDepth = depth + 1;
1349
0
            ModeDepth& nd = m_modeDepth[nextDepth];
1350
0
            invalidateContexts(nextDepth);
1351
0
            Entropy* nextContext = &m_rqt[depth].cur;
1352
0
            int nextQP = qp;
1353
0
            splitIntra = false;
1354
1355
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1356
0
            {
1357
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1358
0
                if (childGeom.flags & CUGeom::PRESENT)
1359
0
                {
1360
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1361
0
                    m_rqt[nextDepth].cur.load(*nextContext);
1362
1363
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1364
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1365
1366
0
                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
1367
1368
                    // Save best CU and pred data for this sub CU
1369
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
1370
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1371
0
                    splitPred->addSubCosts(*nd.bestMode);
1372
1373
0
                    if (m_param->rdLevel)
1374
0
                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1375
0
                    else
1376
0
                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
1377
0
                    if (m_param->rdLevel > 1)
1378
0
                        nextContext = &nd.bestMode->contexts;
1379
0
                }
1380
0
                else
1381
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
1382
0
            }
1383
0
            nextContext->store(splitPred->contexts);
1384
1385
0
            if (mightNotSplit)
1386
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
1387
0
            else if (m_param->rdLevel > 1)
1388
0
                updateModeCost(*splitPred);
1389
0
            else
1390
0
                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
1391
0
        }
1392
        /* If analysis mode is simple, do not evaluate other modes */
1393
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
1394
0
        {
1395
0
            if (m_slice->m_sliceType == P_SLICE)
1396
0
            {
1397
0
                if (m_checkMergeAndSkipOnly[0])
1398
0
                    skipModes = true;
1399
0
            }
1400
0
            else
1401
0
            {
1402
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
1403
0
                    skipModes = true;
1404
0
            }
1405
0
        }
1406
        /* Split CUs
1407
         *   0  1
1408
         *   2  3 */
1409
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
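        /* allSplitRefs is the union of the reference masks chosen by the four
         * child CUs; with reference limiting enabled it narrows which
         * references the 2Nx2N, rect and AMP searches below may use */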
1410
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1411
0
        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
1412
0
        {
1413
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1414
0
                setLambdaFromQP(parentCTU, qp);
1415
1416
0
            if (!skipModes)
1417
0
            {
1418
0
                uint32_t refMasks[2];
1419
0
                refMasks[0] = allSplitRefs;
1420
0
                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1421
0
                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1422
1423
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
1424
0
                {
1425
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
1426
0
                    uint32_t refMask = cu.getBestRefIdx(0);
1427
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1428
0
                }
1429
1430
0
                if (m_slice->m_sliceType == B_SLICE)
1431
0
                {
1432
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1433
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1434
0
                }
1435
1436
0
                Mode *bestInter = &md.pred[PRED_2Nx2N];
1437
0
                if (!skipRectAmp)
1438
0
                {
1439
0
                    if (m_param->bEnableRectInter)
1440
0
                    {
1441
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1442
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
1443
1444
0
                        if (m_slice->m_sliceType == P_SLICE)
1445
0
                        {
1446
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1447
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1448
0
                        }
1449
0
                        else
1450
0
                        {
1451
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1452
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1453
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1454
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1455
0
                        }
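                        /* thresholds for B slices are the rounded mean of the
                         * L0 and L1 child MV costs; (a + b + 1) >> 1 rounds to
                         * nearest, e.g. (3 + 4 + 1) >> 1 == 4. A rectangular
                         * candidate is tried only while the 4-way split cost is
                         * within this margin of the 2Nx2N sa8d cost, and the
                         * orientation with the lower threshold is tried first. */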
1456
1457
0
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1458
0
                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1459
0
                        {
1460
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1461
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1462
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1463
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1464
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1465
0
                                bestInter = &md.pred[PRED_2NxN];
1466
0
                        }
1467
1468
0
                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
1469
0
                        {
1470
0
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1471
0
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1472
0
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1473
0
                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1474
0
                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1475
0
                                bestInter = &md.pred[PRED_Nx2N];
1476
0
                        }
1477
1478
0
                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1479
0
                        {
1480
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1481
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1482
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1483
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1484
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1485
0
                                bestInter = &md.pred[PRED_2NxN];
1486
0
                        }
1487
0
                    }
1488
1489
0
                    if (m_slice->m_sps->maxAMPDepth > depth)
1490
0
                    {
1491
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1492
0
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1493
1494
0
                        if (m_slice->m_sliceType == P_SLICE)
1495
0
                        {
1496
0
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1497
0
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1498
1499
0
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1500
0
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1501
0
                        }
1502
0
                        else
1503
0
                        {
1504
0
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1505
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1506
0
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
1507
0
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1508
1509
0
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1510
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1511
0
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
1512
0
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1513
0
                        }
1514
1515
0
                        bool bHor = false, bVer = false;
1516
0
                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
1517
0
                            bHor = true;
1518
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
1519
0
                            bVer = true;
1520
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
1521
0
                            md.bestMode && md.bestMode->cu.getQtRootCbf(0))
1522
0
                        {
1523
0
                            bHor = true;
1524
0
                            bVer = true;
1525
0
                        }
1526
1527
0
                        if (bHor)
1528
0
                        {
1529
0
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1530
0
                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1531
0
                            {
1532
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1533
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1534
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1535
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1536
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1537
0
                                    bestInter = &md.pred[PRED_2NxnD];
1538
0
                            }
1539
1540
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
1541
0
                            {
1542
0
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1543
0
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
1544
0
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1545
0
                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1546
0
                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1547
0
                                    bestInter = &md.pred[PRED_2NxnU];
1548
0
                            }
1549
1550
0
                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1551
0
                            {
1552
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1553
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1554
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1555
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1556
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1557
0
                                    bestInter = &md.pred[PRED_2NxnD];
1558
0
                            }
1559
0
                        }
1560
0
                        if (bVer)
1561
0
                        {
1562
0
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1563
0
                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1564
0
                            {
1565
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1566
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1567
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1568
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1569
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1570
0
                                    bestInter = &md.pred[PRED_nRx2N];
1571
0
                            }
1572
1573
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
1574
0
                            {
1575
0
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
1576
0
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
1577
0
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1578
0
                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1579
0
                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1580
0
                                    bestInter = &md.pred[PRED_nLx2N];
1581
0
                            }
1582
1583
0
                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1584
0
                            {
1585
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1586
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1587
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1588
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1589
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1590
0
                                    bestInter = &md.pred[PRED_nRx2N];
1591
0
                            }
1592
0
                        }
1593
0
                    }
1594
0
                }
1595
0
                bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
1596
0
                if (m_param->rdLevel >= 3)
1597
0
                {
1598
                    /* Calculate RD cost of best inter option */
1599
0
                    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1600
0
                    {
1601
0
                        uint32_t numPU = bestInter->cu.getNumPartInter(0);
1602
0
                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1603
0
                        {
1604
0
                            PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1605
0
                            motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1606
0
                        }
1607
0
                    }
1608
1609
0
                    if (!chooseMerge)
1610
0
                    {
1611
0
                        encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1612
0
                        checkBestMode(*bestInter, depth);
1613
1614
                        /* If BIDIR is available and within 17/16 of the best inter option (i.e., its sa8d cost is at most 6.25% higher), choose between them by RDO */
1615
0
                        if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1616
0
                            md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1617
0
                        {
1618
0
                            uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
1619
0
                            if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
1620
0
                                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1621
0
                                {
1622
0
                                    PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
1623
0
                                    motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
1624
0
                                }
1625
0
                            encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1626
0
                            checkBestMode(md.pred[PRED_BIDIR], depth);
1627
0
                        }
1628
0
                    }
1629
1630
0
                    if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1631
0
                        md.bestMode->sa8dCost == MAX_INT64)
1632
0
                    {
1633
0
                        if (!m_param->limitReferences || splitIntra)
1634
0
                        {
1635
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1636
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1637
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1638
0
                            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1639
0
                            checkBestMode(md.pred[PRED_INTRA], depth);
1640
0
                        }
1641
0
                        else
1642
0
                        {
1643
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1644
0
                        }
1645
0
                    }
1646
0
                }
1647
0
                else
1648
0
                {
1649
                    /* SA8D choice between merge/skip, inter, bidir, and intra */
1650
0
                    if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1651
0
                        md.bestMode = bestInter;
1652
1653
0
                    if (m_slice->m_sliceType == B_SLICE &&
1654
0
                        md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1655
0
                        md.bestMode = &md.pred[PRED_BIDIR];
1656
1657
0
                    if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1658
0
                    {
1659
0
                        if (!m_param->limitReferences || splitIntra)
1660
0
                        {
1661
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1662
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1663
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1664
0
                            if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1665
0
                                md.bestMode = &md.pred[PRED_INTRA];
1666
0
                        }
1667
0
                        else
1668
0
                        {
1669
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1670
0
                        }
1671
0
                    }
1672
1673
                    /* finally code the best mode selected by SA8D costs:
1674
                     * RD level 2 - fully encode the best mode
1675
                     * RD level 1 - generate recon pixels
1676
                     * RD level 0 - generate chroma prediction */
1677
0
                    if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
1678
0
                    {
1679
                        /* prediction already generated for this CU, and if rd level
1680
                         * is not 0, it is already fully encoded */
1681
0
                    }
1682
0
                    else if (md.bestMode->cu.isInter(0))
1683
0
                    {
1684
0
                        uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1685
0
                        if (m_csp != X265_CSP_I400)
1686
0
                        {
1687
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1688
0
                            {
1689
0
                                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1690
0
                                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1691
0
                            }
1692
0
                        }
1693
0
                        if (m_param->rdLevel == 2)
1694
0
                            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1695
0
                        else if (m_param->rdLevel == 1)
1696
0
                        {
1697
                            /* generate recon pixels with no rate distortion considerations */
1698
0
                            CUData& cu = md.bestMode->cu;
1699
1700
0
                            uint32_t tuDepthRange[2];
1701
0
                            cu.getInterTUQtDepthRange(tuDepthRange, 0);
1702
0
                            m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
1703
0
                            residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1704
0
                            if (cu.getQtRootCbf(0))
1705
0
                                md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
1706
0
                            else
1707
0
                            {
1708
0
                                md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
1709
0
                                if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
1710
0
                                    cu.setPredModeSubParts(MODE_SKIP);
1711
0
                            }
1712
0
                        }
1713
0
                    }
1714
0
                    else
1715
0
                    {
1716
0
                        if (m_param->rdLevel == 2)
1717
0
                            encodeIntraInInter(*md.bestMode, cuGeom);
1718
0
                        else if (m_param->rdLevel == 1)
1719
0
                        {
1720
                            /* generate recon pixels with no rate distortion considerations */
1721
0
                            CUData& cu = md.bestMode->cu;
1722
1723
0
                            uint32_t tuDepthRange[2];
1724
0
                            cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1725
1726
0
                            residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
1727
0
                            if (m_csp != X265_CSP_I400)
1728
0
                            {
1729
0
                                getBestIntraModeChroma(*md.bestMode, cuGeom);
1730
0
                                residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
1731
0
                            }
1732
0
                            md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
1733
0
                        }
1734
0
                    }
1735
0
                }
1736
0
            } // !earlyskip
1737
1738
0
            if (m_bTryLossless)
1739
0
                tryLossless(cuGeom);
1740
1741
0
            if (mightSplit)
1742
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
1743
0
        }
1744
1745
0
        if (mightSplit && !skipRecursion)
1746
0
        {
1747
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
1748
0
            if (!md.bestMode)
1749
0
                md.bestMode = splitPred;
1750
0
            else if (m_param->rdLevel > 1)
1751
0
                checkBestMode(*splitPred, cuGeom.depth);
1752
0
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
1753
0
                md.bestMode = splitPred;
1754
1755
0
            checkDQPForSplitPred(*md.bestMode, cuGeom);
1756
0
        }
1757
1758
        /* determine which motion references the parent CU should search */
1759
0
        splitCUData.initSplitCUData();
1760
1761
0
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1762
0
        {
1763
0
            if (md.bestMode == &md.pred[PRED_SPLIT])
1764
0
                splitCUData.splitRefs = allSplitRefs;
1765
0
            else
1766
0
            {
1767
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1768
0
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1769
0
                uint32_t numPU = cu.getNumPartInter(0);
1770
0
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1771
0
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1772
0
            }
1773
0
        }
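        /* each PU contributes a bitmask of the references it actually used
         * (an intra winner falls back to the 2Nx2N inter candidate's refs);
         * the parent call ORs these masks across its four children */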
1774
1775
0
        if (m_param->limitModes)
1776
0
        {
1777
0
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1778
0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1779
0
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1780
0
        }
1781
1782
0
        if (mightNotSplit && md.bestMode->cu.isSkipped(0))
1783
0
        {
1784
0
            FrameData& curEncData = *m_frame->m_encData;
1785
0
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1786
0
            uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1787
0
            cuStat.count[depth] += 1;
1788
0
            cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1789
0
        }
1790
1791
        /* Copy best data to encData CTU and recon */
1792
0
        md.bestMode->cu.copyToPic(depth);
1793
0
        if (m_param->rdLevel)
1794
0
            md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
1795
1796
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1797
0
        {
1798
0
            if (mightNotSplit)
1799
0
            {
1800
0
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
1801
0
                int8_t maxTUDepth = -1;
1802
0
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
1803
0
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
1804
0
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
1805
0
            }
1806
0
        }
1807
0
    }
1808
0
    else
1809
0
    {
1810
0
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
1811
0
        {
1812
0
            qprdRefine(parentCTU, cuGeom, qp, qp);
1813
1814
0
            SplitData splitData[4];
1815
0
            splitData[0].initSplitCUData();
1816
0
            splitData[1].initSplitCUData();
1817
0
            splitData[2].initSplitCUData();
1818
0
            splitData[3].initSplitCUData();
1819
1820
0
            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1821
1822
0
            splitCUData.initSplitCUData();
1823
1824
0
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1825
0
            {
1826
0
                if (md.bestMode == &md.pred[PRED_SPLIT])
1827
0
                    splitCUData.splitRefs = allSplitRefs;
1828
0
                else
1829
0
                {
1830
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1831
0
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1832
0
                    uint32_t numPU = cu.getNumPartInter(0);
1833
0
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1834
0
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1835
0
                }
1836
0
            }
1837
1838
0
            if (m_param->limitModes)
1839
0
            {
1840
0
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1841
0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1842
0
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
1843
0
            }
1844
0
        }
1845
0
    }
1846
1847
0
    return splitCUData;
1848
0
}
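/* A minimal standalone sketch (not part of x265) of the partition-gating
 * arithmetic used in compressInterCU_rd0_4 above; roundedMean and
 * worthTryingRect are illustrative names invented for this example. */
#include <cstdint>

static inline uint32_t roundedMean(uint32_t a, uint32_t b)
{
    return (a + b + 1) >> 1; // mean of L0/L1 MV costs, rounded to nearest
}

static inline bool worthTryingRect(uint64_t splitCost, uint64_t cost2Nx2N, uint32_t threshold)
{
    // evaluate a rect/AMP candidate only while the 4-way split cost stays
    // within an MV-cost-derived margin of the square 2Nx2N cost
    return splitCost < cost2Nx2N + threshold;
}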
1849
1850
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1851
0
{
1852
0
    if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
1853
0
        return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
1854
1855
0
    uint32_t depth = cuGeom.depth;
1856
0
    ModeDepth& md = m_modeDepth[depth];
1857
0
    md.bestMode = NULL;
1858
1859
0
    if (m_param->searchMethod == X265_SEA)
1860
0
    {
1861
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
1862
0
        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
1863
0
        for (int list = 0; list < numPredDir; list++)
1864
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1865
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1866
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1867
0
    }
1868
1869
0
    SplitData splitCUData;
1870
1871
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1872
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1873
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1874
1875
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1876
0
    {
1877
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1878
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1879
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1880
0
        bool skipRecursion = false;
1881
0
        bool skipModes = false;
1882
0
        bool splitIntra = true;
1883
0
        bool skipRectAmp = false;
1884
0
        bool bCtuInfoCheck = false;
1885
0
        int sameContentRef = 0;
1886
1887
0
        if (m_evaluateInter)
1888
0
        {
1889
0
            if (m_refineLevel == 2)
1890
0
            {
1891
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1892
0
                    skipModes = true;
1893
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1894
0
                    skipRectAmp = true;
1895
0
            }
1896
0
            mightSplit &= false;
1897
0
        }
1898
1899
        // avoid uninitialized values in the reference below
1900
0
        if (m_param->limitModes)
1901
0
        {
1902
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1903
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1904
0
            md.pred[PRED_2Nx2N].rdCost = 0;
1905
0
        }
1906
1907
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1908
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1909
1910
0
        SplitData splitData[4];
1911
0
        splitData[0].initSplitCUData();
1912
0
        splitData[1].initSplitCUData();
1913
0
        splitData[2].initSplitCUData();
1914
0
        splitData[3].initSplitCUData();
1915
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1916
0
        uint32_t refMasks[2];
1917
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1918
0
        {
1919
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1920
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1921
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1922
0
            {
1923
0
                mightNotSplit &= bDecidedDepth;
1924
0
                bCtuInfoCheck = skipRecursion = false;
1925
0
                skipModes = true;
1926
0
            }
1927
0
            else if (mightNotSplit && bDecidedDepth)
1928
0
            {
1929
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
1930
0
                {
1931
0
                    bCtuInfoCheck = skipRecursion = true;
1932
0
                    refMasks[0] = allSplitRefs;
1933
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1934
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1935
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1936
0
                    if (!sameContentRef)
1937
0
                    {
1938
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1939
0
                        {
1940
0
                            qp -= int32_t(0.04 * qp);
1941
0
                            setLambdaFromQP(parentCTU, qp);
1942
0
                        }
1943
0
                        if (m_param->bCTUInfo & 4)
1944
0
                            skipModes = false;
1945
0
                    }
1946
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1947
0
                    {
1948
0
                        if (m_param->rdLevel)
1949
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1950
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
1951
0
                            skipModes = md.bestMode && true;
1952
0
                    }
1953
0
                }
1954
0
                else
1955
0
                {
1956
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1957
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1958
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1959
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1960
0
                    refMasks[0] = allSplitRefs;
1961
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1962
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1963
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1964
0
                }
1965
0
                mightSplit &= !bDecidedDepth;
1966
0
            }
1967
0
        }
1968
0
        if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
1969
0
        {
1970
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1971
0
            {
1972
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1973
0
                {
1974
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1975
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1976
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1977
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
1978
0
                    refMasks[0] = allSplitRefs;
1979
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1980
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1981
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
1982
1983
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
1984
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
1985
0
                }
1986
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1987
0
                    skipRectAmp = true && !!md.bestMode;
1988
0
            }
1989
0
        }
1990
1991
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1992
0
        {
1993
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1994
0
            {
1995
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1996
0
                {
1997
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1998
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1999
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2000
2001
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2002
0
                    refMasks[0] = allSplitRefs;
2003
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2004
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2005
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2006
2007
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2008
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2009
0
                }
2010
0
            }
2011
0
        }
2012
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
2013
0
        if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
2014
0
            (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
2015
0
        {
2016
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2017
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2018
0
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2019
0
            skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
2020
0
                md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2021
0
            refMasks[0] = allSplitRefs;
2022
0
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2023
0
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2024
0
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2025
2026
0
            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
2027
0
                skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2028
0
            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
2029
0
                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
2030
0
        }
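        /* at rd-levels 5/6 the recursion skip keys off the root CBF: a best
         * mode that coded no residual is taken as evidence that splitting
         * further is unlikely to pay off */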
2031
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
2032
0
            skipRecursion = true;
2033
        // estimate split cost
2034
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
2035
0
        if (mightSplit && !skipRecursion)
2036
0
        {
2037
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
2038
0
                qp = int((1 / 0.96) * qp + 0.5);
2039
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
2040
0
            splitPred->initCosts();
2041
0
            CUData* splitCU = &splitPred->cu;
2042
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
2043
2044
0
            uint32_t nextDepth = depth + 1;
2045
0
            ModeDepth& nd = m_modeDepth[nextDepth];
2046
0
            invalidateContexts(nextDepth);
2047
0
            Entropy* nextContext = &m_rqt[depth].cur;
2048
0
            int nextQP = qp;
2049
0
            splitIntra = false;
2050
2051
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2052
0
            {
2053
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2054
0
                if (childGeom.flags & CUGeom::PRESENT)
2055
0
                {
2056
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2057
0
                    m_rqt[nextDepth].cur.load(*nextContext);
2058
2059
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2060
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2061
2062
0
                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
2063
2064
                    // Save best CU and pred data for this sub CU
2065
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
2066
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2067
0
                    splitPred->addSubCosts(*nd.bestMode);
2068
0
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2069
0
                    nextContext = &nd.bestMode->contexts;
2070
0
                }
2071
0
                else
2072
0
                {
2073
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
2074
0
                }
2075
0
            }
2076
0
            nextContext->store(splitPred->contexts);
2077
0
            if (mightNotSplit)
2078
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
2079
0
            else
2080
0
                updateModeCost(*splitPred);
2081
2082
0
            checkDQPForSplitPred(*splitPred, cuGeom);
2083
0
        }
2084
        /* If analysis mode is simple, do not evaluate other modes */
2085
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2086
0
        {
2087
0
            if (m_slice->m_sliceType == P_SLICE)
2088
0
            {
2089
0
                if (m_checkMergeAndSkipOnly[0])
2090
0
                    skipModes = true;
2091
0
            }
2092
0
            else
2093
0
            {
2094
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
2095
0
                    skipModes = true;
2096
0
            }
2097
0
        }
2098
        /* Split CUs
2099
         *   0  1
2100
         *   2  3 */
2101
0
        allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2102
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
2103
0
        if (mightNotSplit)
2104
0
        {
2105
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
2106
0
                setLambdaFromQP(parentCTU, qp);
2107
2108
0
            if (!skipModes)
2109
0
            {
2110
0
                refMasks[0] = allSplitRefs;
2111
2112
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
2113
0
                {
2114
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
2115
0
                    uint32_t refMask = cu.getBestRefIdx(0);
2116
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
2117
0
                }
2118
2119
0
                if (m_slice->m_sliceType == B_SLICE)
2120
0
                {
2121
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
2122
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
2123
0
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
2124
0
                    {
2125
0
                        uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
2126
0
                        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
2127
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2128
0
                            {
2129
0
                                PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
2130
0
                                motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
2131
0
                            }
2132
0
                        encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
2133
0
                        checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
2134
0
                    }
2135
0
                }
2136
2137
0
                if (!skipRectAmp)
2138
0
                {
2139
0
                    if (m_param->bEnableRectInter)
2140
0
                    {
2141
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2142
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
2143
2144
0
                        if (m_slice->m_sliceType == P_SLICE)
2145
0
                        {
2146
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2147
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2148
0
                        }
2149
0
                        else
2150
0
                        {
2151
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2152
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2153
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2154
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2155
0
                        }

                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
                        if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                        }

                        if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                            checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                        }

                        if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
                        {
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                        }
                    }

                    // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
                    if (m_slice->m_sps->maxAMPDepth > depth)
                    {
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;

                        if (m_slice->m_sliceType == P_SLICE)
                        {
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];

                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
                        }
                        else
                        {
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;

                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                        }

                        bool bHor = false, bVer = false;
                        if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                            bHor = true;
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                            bVer = true;
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
                        {
                            bHor = true;
                            bVer = true;
                        }
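                        /* AMP is only evaluated in the direction suggested by the best
                         * mode so far: a 2NxN winner enables the horizontal 2NxnU/2NxnD
                         * shapes, an Nx2N winner the vertical nLx2N/nRx2N shapes, and a
                         * non-merge 2Nx2N winner enables both; any other winner skips
                         * AMP entirely. */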

                        if (bHor)
                        {
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
                            if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                            }

                            if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                                checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                            }

                            if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                            }
                        }

                        if (bVer)
                        {
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
                            if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                            }

                            if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
                            {
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                                checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                            }

                            if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
                            {
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                            }
                        }
                    }
                }

                if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
                {
                    if (!m_param->limitReferences || splitIntra)
                    {
                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
                        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
                        checkBestMode(md.pred[PRED_INTRA], depth);

                        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                        {
                            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
                            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                        }
                    }
                    else
                    {
                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                    }
                }
            }

            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
            {
                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);

                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
                {
                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
                }
                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
            }
            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }

        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
        {
            if (mightNotSplit)
            {
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
                int8_t maxTUDepth = -1;
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
            }
        }

        /* compare split RD cost against best cost */
        if (mightSplit && !skipRecursion)
            checkBestMode(md.pred[PRED_SPLIT], depth);

        if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
        {
            int cuIdx = (cuGeom.childOffset - 1) / 3;
            cacheCost[cuIdx] = md.bestMode->rdCost;
        }

        /* determine which motion references the parent CU should search */
        splitCUData.initSplitCUData();
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
        {
            if (md.bestMode == &md.pred[PRED_SPLIT])
                splitCUData.splitRefs = allSplitRefs;
            else
            {
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
                uint32_t numPU = cu.getNumPartInter(0);
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
            }
        }

        if (m_param->limitModes)
        {
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
        }

        /* Copy best data to encData CTU and recon */
        md.bestMode->cu.copyToPic(depth);
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
    }
    else
    {
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
        {
            qprdRefine(parentCTU, cuGeom, qp, qp);

            SplitData splitData[4];
            splitData[0].initSplitCUData();
            splitData[1].initSplitCUData();
            splitData[2].initSplitCUData();
            splitData[3].initSplitCUData();

            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;

            splitCUData.initSplitCUData();
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
            {
                if (md.bestMode == &md.pred[PRED_SPLIT])
                    splitCUData.splitRefs = allSplitRefs;
                else
                {
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
                    uint32_t numPU = cu.getNumPartInter(0);
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
                }
            }

            if (m_param->limitModes)
            {
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
            }
        }
    }

    return splitCUData;
}

void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    m_evaluateInter = 0;
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;

    TrainingData td;
    td.init(parentCTU, cuGeom);

    if (!m_param->bDynamicRefine)
        m_refineLevel = m_param->interRefine;
    else
        m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;

    if (m_param->interRefine == 1)
        split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit ||
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
    else
        split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
    td.split = split;
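    /* The forced split above only applies to scaled analysis reuse
     * (m_param->scaleFactor), where the loaded depth decision comes from a
     * lower-resolution pass: a CU whose decided depth either mandates a split
     * or sits one level above the minimum CU size while refinement is enabled
     * is re-evaluated one level deeper, and with --interRefine 1 this is
     * additionally restricted to CUs that were coded as SKIP. */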

    if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4))
    {
        setLambdaFromQP(parentCTU, qp, lqp);

        Mode& mode = md.pred[0];
        md.bestMode = &mode;
        mode.cu.initSubCU(parentCTU, cuGeom, qp);
        PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
        {
            if (m_param->intraRefine == 4)
                compressIntraCU(parentCTU, cuGeom, qp);
            else
            {
                bool reuseModes = !((m_param->intraRefine == 3) ||
                    (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
                if (reuseModes)
                {
                    memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                    memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                }
                checkIntra(mode, cuGeom, size);
            }
        }
        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
        {
            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
            uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
            for (uint32_t part = 0; part < numPU; part++)
            {
                PredictionUnit pu(mode.cu, cuGeom, part);
                if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7))
                {
                    x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
                    int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
                    mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
                    mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
                    for (int list = 0; list < m_slice->isInterB() + 1; list++)
                    {
                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
                        mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
                        mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
                    }
                    if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
                    {
                        if (m_param->interRefine == 1)
                            m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
                        // AMVP
                        MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
                        mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
                        for (int list = 0; list < m_slice->isInterB() + 1; list++)
                        {
                            int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
                            if (ref == -1)
                                continue;
                            MV mvp;

                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
                            mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
                            if (m_param->interRefine == 1)
                            {
                                MV outmv, mvpSelect[3];
                                mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
                                if (m_param->mvRefine > 1)
                                {
                                    mvpSelect[1] = mvp;
                                    if (m_param->mvRefine > 2)
                                        mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
                                }
                                searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
                                mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
                            }
                            mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
                        }
                    }
                    else
                    {
                        MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
                        uint8_t candDir[MRG_MAX_NUM_CANDS];
                        mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
                        uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
                        if (mode.cu.isBipredRestriction())
                        {
                            /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
                            if (candDir[mvpIdx] == 3)
                            {
                                candDir[mvpIdx] = 1;
                                candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
                            }
                        }
                        mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
                        mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
                        mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
                        mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part);
                        mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part);
                    }
                }
                motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
            }
            if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
                encodeResAndCalcRdSkipCU(mode);
            else
                encodeResAndCalcRdInterCU(mode, cuGeom);

            /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
            bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0];
            if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
                checkDQP(mode, cuGeom);
        }

        if (m_refineLevel < 2)
        {
            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            if (mightSplit && m_param->rdLevel < 5)
                checkDQPForSplitPred(*md.bestMode, cuGeom);
        }

        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
        {
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
            {
                m_modeFlag[list] = true;
                if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
                    m_checkMergeAndSkipOnly[list] = true;
            }
            m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
            {
                m_modeFlag[list] = false;
                m_checkMergeAndSkipOnly[list] = false;
            }
        }

        if (m_param->bDynamicRefine)
            classifyCU(parentCTU, cuGeom, *md.bestMode, td);

        if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)))
        {
            if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit)
                m_evaluateInter = 1;
            else
                bDecidedDepth = true;
            m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
            m_evaluateInter = 0;
        }
    }
    if (!bDecidedDepth || split)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        if (!split)
            md.bestMode = splitPred;
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int nextQP = qp;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp;

                if (split)
                    m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
                else
                    qprdRefine(parentCTU, childGeom, nextQP, lamdaQP);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                splitCU->setEmptyPart(childGeom, subPartIdx);
                // Set depth of non-present CU to 0 to ensure that the correct CU is fetched as reference to code deltaQP
                memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        if (m_refineLevel)
        {
            if (m_param->rdLevel > 1)
                checkBestMode(*splitPred, cuGeom.depth);
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
                md.bestMode = splitPred;
        }

        checkDQPForSplitPred(*splitPred, cuGeom);

        /* Copy best data to encData CTU and recon */
        md.bestMode->cu.copyToPic(depth);
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
    }
    if (m_param->bDynamicRefine && bDecidedDepth)
        trainCU(parentCTU, cuGeom, *md.bestMode, td);
}

void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
    if (m_frame->m_classifyFrame)
    {
        uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
        uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
        float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
        uint8_t varRefineLevel = 1;
        uint8_t rdRefineLevel = 1;
        uint64_t cuCost = bestMode.rdCost;
        int offset = (depth * X265_REFINE_INTER_LEVELS);
        if (cuCost < m_frame->m_classifyRd[offset])
            m_refineLevel = 1;
        else
        {
            uint64_t trainingCount = 0;
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                trainingCount += m_frame->m_classifyCount[offset];
            }
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                /* Calculate distance values */
                diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
                diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));

                /* Calculate prior probability - ranges between 0 and 1 */
                if (trainingCount)
                    probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);

                /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
                P(c|x) is the posterior probability of class given predictor.
                P(c) is the prior probability of class.
                P(x|c) is the likelihood, which is the probability of predictor given class.
                P(x) is the prior probability of predictor. */
                int curRefineLevel = m_refineLevel - 1;
                if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
                    varRefineLevel = i + 1;
                if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
                    rdRefineLevel = i + 1;
            }
            m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
        }
    }
}

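/* The comparisons in classifyCU() implement a prior-weighted minimum-distance
 * rule: "diff[i] * prob[cur] < diff[cur] * prob[i]" is the division-free form
 * of "diff[i] / prob[i] < diff[cur] / prob[cur]", with each candidate level
 * measured against the previously chosen refine level rather than a running
 * best. A minimal standalone sketch of the same rule with made-up numbers
 * (three levels, running-best variant for clarity; not encoder data):
 *
 *     uint64_t diff[3] = { 120, 40, 90 };       // |predictor - class centroid|
 *     float    prob[3] = { 0.5f, 0.3f, 0.2f };  // priors from training counts
 *     int best = 0;
 *     for (int i = 1; i < 3; i++)
 *         if (diff[i] * prob[best] < diff[best] * prob[i])
 *             best = i;    // 40*0.5 = 20 < 120*0.3 = 36 -> best = 1;
 *                          // 90*0.3 = 27 < 40*0.2 = 8 is false -> keep 1
 */
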
void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    int classify = 1;
    if (!m_frame->m_classifyFrame)
    {
        /* classify = 1 : CUs for which the saved data matches the data obtained after
                          encoding with refine-inter 3, and CUs that have split.
           classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
           classify = 3 : CUs encoded as any other mode. */

        bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
            trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
            trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
        bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
        if (refineInter0 || refineInter1)
            classify = 1;
        else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
            classify = 2;
        else
            classify = 3;
    }
    else
        classify = m_refineLevel;
    uint64_t cuCost = bestMode.rdCost;
    int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
    ctu.m_collectCURd[offset] += cuCost;
    ctu.m_collectCUVariance[offset] += trainData.cuVariance;
    ctu.m_collectCUCount[offset]++;
}

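/* The m_collectCURd / m_collectCUVariance / m_collectCUCount accumulators are
 * indexed by (depth, class). Presumably they are reduced elsewhere (outside
 * this file) into the per-frame m_classifyRd / m_classifyVariance /
 * m_classifyCount statistics that classifyCU() reads once m_classifyFrame is
 * set. */
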
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->initCosts();
    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->initCosts();
    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
    PredictionUnit pu(merge.cu, cuGeom, 0);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int sizeIdx = cuGeom.log2CUSize - 2;
    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
    }
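    /* Periodic intra refresh note: pirEndCol * maxCUSize is the first pixel
     * column of the reference frame that has not yet been refreshed; the "- 3"
     * presumably backs off for the reach of the subpel interpolation filter,
     * and the "* 4" converts the pixel distance into quarter-pel MV units. */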
    for (uint32_t i = 0; i < numMergeCand; ++i)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: first row in slice can't be negative
                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond its bound, since that area belongs to another slice
                // TODO: we may go beyond this bound in future, since those areas have a chance to finish because we use parallel slices; research on load balancing is needed first
                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                    continue;
            }

            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond the safe reference area
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

        tempPred->sa8dBits = getTUBits(i, numMergeCand);
        tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64;
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->sa8dBits = bestPred->sa8dBits;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
    md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
    md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
    md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
    md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
    checkDQP(*md.bestMode, cuGeom);
}

/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    merge.initCosts();
    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.initCosts();
    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
    PredictionUnit pu(merge.cu, cuGeom, 0);

    bool foundCbf0Merge = false;
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;

    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
    }
    for (uint32_t i = 0; i < numMergeCand; i++)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: first row in slice can't be negative
                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond its bound, since that area belongs to another slice
                // TODO: we may go beyond this bound in future, since those areas have a chance to finish because we use parallel slices; research on load balancing is needed first
                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
                    continue;
            }

            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
        if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (candDir[i] == 3 &&
            !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
            !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond the safe reference area
            continue;
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */

        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */

            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */

            if (swapped)
            {
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = candDir[i];
                tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
                tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
                tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
                tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
                tempPred->cu.setPredModeSubParts(MODE_INTER);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
        bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
        checkDQP(*bestPred, cuGeom);
    }
}

void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }
    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);

    /* predInterSearch sets interMode.sa8dBits */
    const Yuv& fencYuv = *interMode.fencYuv;
    Yuv& predYuv = interMode.predYuv;
    int part = partitionFromLog2Size(cuGeom.log2CUSize);
    interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
    }
    interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);
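    /* At the lower rd-levels this sa8d cost (SAD of the Hadamard-transformed
     * prediction residual plus lambda-weighted estimated bits) stands in for a
     * full RD cost; the caller only runs a full residual encode on candidates
     * that survive this comparison. */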

    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}

void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }

    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);

    /* predInterSearch sets interMode.sa8dBits, but this is ignored */
    encodeResAndCalcRdInterCU(interMode, cuGeom);

    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}

void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
    {
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV   mvzero(0, 0);
    int  partEnum = cuGeom.log2CUSize - 2;

    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0    = bestME[0].ref;
    MV  mvp0    = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1    = bestME[1].ref;
    MV  mvp1    = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    PredictionUnit pu(cu, cuGeom, 0);
    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

    int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        /* Add in chroma distortion */
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
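    /* The bit estimate above swaps the two unidirectional list-selection costs
     * for the single bidirectional one: bestME[0].bits and bestME[1].bits each
     * already include m_listSelBits[0] / m_listSelBits[1], so those are
     * subtracted and m_listSelBits[2] (the cost of signalling bi-prediction)
     * is added in their place. */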

    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            motionCompensation(cu, pu, tmpPredYuv, true, true);
            zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);

        }
        else
        {
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
            primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }
3241
0
        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
3242
0
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
3243
0
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
3244
3245
        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
3246
0
        mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
3247
0
        mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);
3248
3249
0
        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3250
0
        zcost = zsa8d + m_rdCost.getCost(zbits);
3251
3252
0
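
        /* Illustrative (assumed numbers): if zsa8d = 5000 and the refined
         * zero-MV bit counts are bits0 = 8 and bits1 = 9, then with the same
         * m_listSelBits as above zbits = 8 + 9 + 3 - (2 + 2) = 16, and
         * zcost = 5000 plus the lambda-weighted cost of 16 bits. */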

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d) /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
        }
        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}

void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.absPartIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom, m_csp);

    PicYuv& reconPic = *m_frame->m_reconPic;

    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough

        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            getBestIntraModeChroma(*bestMode, cuGeom);
            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
        }
    }
    else // if (cu.isInter(0))
    {
        ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough

        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */
        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);

        primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
                                      fencYuv.m_buf[0], predY,
                                      fencYuv.m_size, predYuv.m_size);

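        /* Note: sub_ps computes a signed 16-bit residual block,
         * resi[i] = (int16_t)(fenc[i] - pred[i]); e.g. a source pixel of 130
         * predicted as 128 leaves a residual of +2. */
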
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);
        }

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

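        /* Note: a 2Nx2N merge CU whose root cbf is zero after quantization is
         * exactly an HEVC SKIP CU (merge index only, no coded residual), so
         * the prediction mode is upgraded to MODE_SKIP here. */
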
        /* residualTransformQuantInter() wrote the transformed residual back
         * into resiYuv. Generate the recon pixels by adding it to the prediction */
        if (cu.m_cbf[0][0])
        {
            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
            bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
            primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
                (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        }
        else
            primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                           predY, predYuv.m_size);
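
        /* Note: the boolean index into the add_ps primitive table selects an
         * aligned kernel only when every buffer offset and stride involved is
         * a multiple of 64 (presumably to allow aligned vector loads/stores);
         * otherwise the unaligned variant runs. */
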
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            if (cu.m_cbf[1][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                    (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                             predU, predYuv.m_csize);

            if (cu.m_cbf[2][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                    (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                             predV, predYuv.m_csize);
        }
    }

    cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
}

void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
}

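/* Summary of the tiers above: at rd-levels >= 3 the split flag is entropy-coded
 * to obtain its true bit cost; at rd-level 2 it is approximated as one extra
 * bit in totalBits; at rd-levels 0-1 one extra bit is added to sa8dBits and
 * the SAD-based cost is recomputed. */
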
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);

    /* allow block size growth if QP is rising or the average depth is
     * less than 1.5x the minimum depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}

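/* Worked example (assumed numbers): for a 64x64 CTU (256 partitions) with two
 * reference lists and minDepth = 1, thresh = 1 * 2 * 64 = 128. The loops
 * sample every 4th partition, 128 samples in total, so sum <= 192 means the
 * average co-located depth is at most 1.5; only then is minDepth relaxed to 0. */
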
/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of the best mode at depth n is below the
     * weighted average of the RD costs already accumulated at depth n for
     * this CTU and for its neighbor CTUs (above, above-left, above-right,
     * left) */

    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = *m_frame->m_encData;
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];

        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
        if (aboveLeft)
        {
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
            neighCount += lstat.count[depth];
        }

        const CUData* aboveRight = parentCTU.m_cuAboveRight;
        if (aboveRight)
        {
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
            neighCount += rstat.count[depth];
        }
    }
    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // give 60% weight to the current CTU's history and 40% to the neighbour CTUs'
    if (neighCount + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)
            return true;
    }

    return false;
}

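/* Worked example (assumed numbers): if this CTU has 4 coded CUs at depth n
 * averaging 1000 (cuCost = 4000) and its neighbours have 6 averaging 2000
 * (neighCost = 12000), then avgCost = (3*4000 + 2*12000) / (3*4 + 2*6)
 * = 36000 / 24 = 1500, and any best mode costing less than 1500 stops the
 * recursion. */
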
bool Analysis::complexityCheckCU(const Mode& bestMode)
{
    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
    {
        uint32_t mean = 0;
        uint32_t homo = 0;
        uint32_t cuSize = bestMode.fencYuv->m_size;
        /* mean luma value of the CU */
        for (uint32_t y = 0; y < cuSize; y++)
        {
            for (uint32_t x = 0; x < cuSize; x++)
            {
                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
            }
        }
        mean = mean / (cuSize * cuSize);
        /* mean absolute deviation from the mean, a simple homogeneity measure */
        for (uint32_t y = 0; y < cuSize; y++)
        {
            for (uint32_t x = 0; x < cuSize; x++)
            {
                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
            }
        }
        homo = homo / (cuSize * cuSize);

        if (homo < (.1 * mean))
            return true;

        return false;
    }
    else
    {
        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
        intptr_t stride = m_frame->m_fencPic->m_stride;
        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
        uint32_t sum = (uint32_t)sum_ss;
        uint32_t ss = (uint32_t)(sum_ss >> 32);
        uint32_t pixelCount = 1 << shift;
        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;

        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
            return false;
        else
            return true;
    }
}

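/* Illustrative thresholds (assumed values): in the RD-cost based mode, a CU
 * with mean luma 120 is judged homogeneous only if its mean absolute deviation
 * is below 12 (10% of the mean); in the edge-based mode, the check passes when
 * the variance of the co-located block of the edge-bit picture does not exceed
 * edgeVarThreshold. */
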
uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t cuVariance = 0;
    uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;

    uint32_t width = m_frame->m_fencPic->m_picWidth;
    uint32_t height = m_frame->m_fencPic->m_picHeight;
    uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
    uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
    uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
    uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
    uint32_t cnt = 0;

    for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
    {
        for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
        {
            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
            cuVariance += blockVariance[idx];
            cnt++;
        }
    }
    return cuVariance / cnt;
}

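/* Illustrative (assumed geometry): with qgSize 16, loopIncr is 16, so a
 * 1920-wide picture has maxCols = 120 variance cells per row; a 32x32 CU at
 * pixel (64, 32) averages the four 16x16 lowres block variances at grid
 * columns 4-5, rows 2-3. */
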
double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
    PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];

    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;

    uint32_t aqStride = pQPLayer->numAQPartInWidth;

    double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
    return dQpOffset;
}

double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
{
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
    PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];

    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;

    uint32_t aqStride = pcAQLayer->numAQPartInWidth;

    double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
    return dQpOffset;
}

int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
{
    FrameData& curEncData = *m_frame->m_encData;
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;

    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && m_param->analysisLoad))
    {
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
            qp += distortionData->offset[ctu.m_cuAddr];
    }

    if (m_param->analysisLoadReuseLevel == 10 && m_param->rc.cuTree)
    {
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
        if (ctu.m_slice->m_sliceType == I_SLICE)
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
        else
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
    }
    if (m_param->rc.hevcAq)
    {
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double dQpOffset = 0;
        if (bCuTreeOffset)
        {
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
        }
        else
        {
            dQpOffset = aqQPOffset(ctu, cuGeom);
            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
        qp += dQpOffset;
    }
    else
    {
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
        if (qpoffs)
        {
            uint32_t width = m_frame->m_fencPic->m_picWidth;
            uint32_t height = m_frame->m_fencPic->m_picHeight;
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
            double dQpOffset = 0;
            uint32_t cnt = 0;
            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
            {
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
                {
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
                    dQpOffset += qpoffs[idx];
                    cnt++;
                }
            }
            dQpOffset /= cnt;
            qp += dQpOffset;
            if (complexCheck)
            {
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
                return (offset < max_threshold);
            }
        }
    }

    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
}

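/* Worked example for the complexCheck path (values assumed): with
 * x265_ADAPT_RD_STRENGTH taken as 4 and dynamicRd = 3, threshold
 * = 1 - (4 - 3) * 0.5 = 0.5, so max_threshold = 50; the function then
 * returns 1 exactly when the averaged AQ offset is below +0.50 QP. */
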
void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
{
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
    int shift = (X265_DEPTH - 8);
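
    /* Derivation of the constants above for 8-bit (PIXEL_MAX = 255):
     * ssim_c1 = (0.01 * 255)^2 * 64 = 416.16 -> 416
     * ssim_c2 = (0.03 * 255)^2 * 64 * 63 = 235962.7 -> 235963
     * These follow the SSIM stabilizing constants C1 = (K1*L)^2 and
     * C2 = (K2*L)^2, scaled by the 64 (and 63) factors used here. */
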
    double s = 1 + 0.005 * qp;

    // Calculate the denominators of the DC and AC normalization factors
    uint64_t fDc_den = 0, fAc_den = 0;

    // 1. Calculate the DC component
    uint64_t z_o = 0;
    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
    {
        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
        {
            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
            z_o += temp * temp; // accumulate Z(0)^2
        }
    }
    fDc_den = (2 * z_o) + (blockSize * blockSize * ssim_c1); // 2 * Z(0)^2 + N * C1
    fDc_den /= ((blockSize >> 2) * (blockSize >> 2));

    // 2. Calculate the AC component
    uint64_t z_k = 0;
    int block = (int)(((log(blockSize) / log(2)) - 2) + 0.5);
    primitives.cu[block].normFact(src, blockSize, shift, &z_k);

    // Remove the DC part
    z_k -= z_o;

    fAc_den = z_k + int(s * z_k) + ssim_c2;
    fAc_den /= ((blockSize >> 2) * (blockSize >> 2));

    ctu.m_fAc_den[ttype] = fAc_den;
    ctu.m_fDc_den[ttype] = fDc_den;
}

void Analysis::calculateNormFactor(CUData& ctu, int qp)
{
    const pixel* srcY = m_modeDepth[0].fencYuv.m_buf[0];
    uint32_t blockSize = m_modeDepth[0].fencYuv.m_size;

    normFactor(srcY, blockSize, ctu, qp, TEXT_LUMA);

    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        const pixel* srcU = m_modeDepth[0].fencYuv.m_buf[1];
        const pixel* srcV = m_modeDepth[0].fencYuv.m_buf[2];
        uint32_t blockSizeC = m_modeDepth[0].fencYuv.m_csize;

        normFactor(srcU, blockSizeC, ctu, qp, TEXT_CHROMA_U);
        normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V);
    }
}

int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
{
    int sameContentRef = 0;
    int curPoc = parentCTU.m_slice->m_poc;
    int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
    int numPredDir = m_slice->isInterP() ? 1 : 2;
    for (int list = 0; list < numPredDir; list++)
    {
        for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
        {
            int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
            int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
            if ((refPoc < prevChange && refPoc < curPoc) || (refPoc > curPoc && prevChange < curPoc && refPrevChange > curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
                sameContentRef++;    /* Content changed */
        }
    }
    return sameContentRef;
}
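
/* Illustrative reading of the first clause (assumed semantics of the ctu-info
 * metadata): with curPoc = 8 and prevChange = 6 (this region last changed at
 * POC 6), a reference at POC 4 satisfies refPoc < prevChange && refPoc < curPoc,
 * i.e. it predates the last recorded change and is therefore counted. */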