Coverage Report

Created: 2026-03-08 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/analysis.cpp
Line
Count
Source
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
*          Steve Borho <steve@borho.org>
6
*          Min Chen <chenm003@163.com>
7
*
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
* the Free Software Foundation; either version 2 of the License, or
11
* (at your option) any later version.
12
*
13
* This program is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
* GNU General Public License for more details.
17
*
18
* You should have received a copy of the GNU General Public License
19
* along with this program; if not, write to the Free Software
20
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
*
22
* This program is also available under a commercial proprietary license.
23
* For more information, contact us at license @ x265.com.
24
*****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "picyuv.h"
30
#include "primitives.h"
31
#include "threading.h"
32
33
#include "analysis.h"
34
#include "rdcost.h"
35
#include "encoder.h"
36
37
using namespace X265_NS;
38
39
/* An explanation of rate distortion levels (--rd-level)
40
 *
41
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
42
 *
43
 *   sa8d selection between merge / skip / inter / intra and split
44
 *   no recon pixels generated until CTU analysis is complete, requiring
45
 *   intra predictions to use source pixels
46
 *
47
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
48
 *
49
 *   RDO selection between merge and skip
50
 *   sa8d selection between (merge/skip) / inter modes / intra and split
51
 *   intra prediction uses reconstructed pixels
52
 *
53
 * rd-level 2 uses RDO for merge/skip and split
54
 *
55
 *   RDO selection between merge and skip
56
 *   sa8d selection between (merge/skip) / inter modes / intra
57
 *   RDO split decisions
58
 *
59
 * rd-level 3 uses RDO for merge/skip/best inter/intra
60
 *
61
 *   RDO selection between merge and skip
62
 *   sa8d selection of best inter mode
63
 *   sa8d decisions include chroma residual cost
64
 *   RDO selection between (merge/skip) / best inter mode / intra / split
65
 *
66
 * rd-level 4 enables RDOQuant
67
 *   chroma residual cost included in satd decisions, including subpel refine
68
 *    (as a result of --subme 3 being used by preset slow)
69
 *
70
 * rd-level 5,6 does RDO for each inter mode
71
 */
72
73
Analysis::Analysis()
74
21.2k
{
75
21.2k
    m_bTryLossless = false;
76
21.2k
    m_bChromaSa8d = false;
77
21.2k
    m_bHD = false;
78
79
21.2k
    memset(m_modeFlag, 0, sizeof(m_modeFlag));
80
21.2k
    memset(m_checkMergeAndSkipOnly, 0, sizeof(m_checkMergeAndSkipOnly));
81
82
106k
    for (int i = 0; i < NUM_CU_DEPTH; i++)
83
85.1k
    {
84
85.1k
        m_modeDepth[i].bestMode = NULL;
85
85.1k
        memset(m_modeDepth[i].pred, 0, sizeof(m_modeDepth[i].pred));
86
85.1k
    }
87
88
21.2k
    m_reuseInterDataCTU = NULL;
89
21.2k
    m_reuseRef = NULL;
90
21.2k
    m_reuseDepth = NULL;
91
21.2k
    m_reuseModes = NULL;
92
21.2k
    m_reusePartSize = NULL;
93
21.2k
    m_reuseMergeFlag = NULL;
94
21.2k
    m_reuseMv[0] = NULL;
95
21.2k
    m_reuseMv[1] = NULL;
96
21.2k
    m_reuseMvpIdx[0] = NULL;
97
21.2k
    m_reuseMvpIdx[1] = NULL;
98
21.2k
    cacheCost = NULL;
99
21.2k
    m_additionalCtuInfo = NULL;
100
21.2k
    m_prevCtuInfoChange = NULL;
101
102
21.2k
    m_evaluateInter = 0;
103
21.2k
    m_refineLevel = 0;
104
105
21.2k
    memset(m_splitRefIdx, 0, sizeof(m_splitRefIdx));
106
21.2k
}
107
108
bool Analysis::create(ThreadLocalData *tld)
109
21.2k
{
110
21.2k
    m_tld = tld;
111
21.2k
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
112
113
21.2k
    int costArrSize = 1;
114
21.2k
    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
115
31.0k
    for (uint32_t i = 1; i <= maxDQPDepth; i++)
116
9.75k
        costArrSize += (1 << (i * 2));
117
21.2k
    cacheCost = X265_MALLOC(uint64_t, costArrSize);
118
119
21.2k
    int csp = m_param->internalCsp;
120
21.2k
    uint32_t cuSize = m_param->maxCUSize;
121
122
21.2k
    bool ok = true;
123
94.0k
    for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1)
124
72.7k
    {
125
72.7k
        ModeDepth &md = m_modeDepth[depth];
126
72.7k
        ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param);
127
72.7k
        ok &= md.fencYuv.create(cuSize, csp);
128
72.7k
        if (ok)
129
72.7k
        {
130
1.09M
            for (int j = 0; j < MAX_PRED_TYPES; j++)
131
1.01M
            {
132
1.01M
                md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j);
133
1.01M
                ok &= md.pred[j].predYuv.create(cuSize, csp);
134
1.01M
                ok &= md.pred[j].reconYuv.create(cuSize, csp);
135
1.01M
                md.pred[j].fencYuv = &md.fencYuv;
136
1.01M
            }
137
72.7k
        }
138
72.7k
    }
139
21.2k
    if (m_param->sourceHeight >= 1080)
140
0
        m_bHD = true;
141
142
21.2k
    return ok;
143
21.2k
}
144
145
void Analysis::destroy()
146
21.2k
{
147
94.0k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
148
72.7k
    {
149
72.7k
        m_modeDepth[i].cuMemPool.destroy();
150
72.7k
        m_modeDepth[i].fencYuv.destroy();
151
152
1.09M
        for (int j = 0; j < MAX_PRED_TYPES; j++)
153
1.01M
        {
154
1.01M
            m_modeDepth[i].pred[j].predYuv.destroy();
155
1.01M
            m_modeDepth[i].pred[j].reconYuv.destroy();
156
1.01M
        }
157
72.7k
    }
158
21.2k
    X265_FREE(cacheCost);
159
21.2k
}
160
161
void Analysis::computeMVForPUs(CUData& ctu, const CUGeom& cuGeom, int qp, Frame& frame)
162
0
{
163
0
    int areaId = 0;
164
0
    int finalIdx = 0;
165
166
0
    uint32_t depth = cuGeom.depth;
167
0
    uint32_t nextDepth = depth + 1;
168
169
0
    uint32_t cuSize = 1 << cuGeom.log2CUSize;
170
0
    bool mightSplit = (cuSize > m_param->minCUSize);
171
172
0
    uint32_t cuX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
173
0
    uint32_t cuY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
174
175
0
    if (cuSize != m_param->maxCUSize)
176
0
    {
177
0
        uint32_t subCUSize = m_param->maxCUSize / 2;
178
0
        areaId = (cuX >= subCUSize) + 2 * (cuY >= subCUSize) + 1;
179
0
    }
180
181
0
    if (mightSplit)
182
0
    {
183
0
        int nextQP = qp;
184
0
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
185
0
        {
186
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
187
0
            if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
188
0
                nextQP = setLambdaFromQP(ctu, calculateQpforCuSize(ctu, childGeom));
189
190
0
            computeMVForPUs(ctu, childGeom, nextQP, frame);
191
0
        }
192
0
    }
193
194
0
    ModeDepth& md = m_modeDepth[cuGeom.depth];
195
0
    CUData& cu = md.pred[PRED_2Nx2N].cu;
196
197
0
    for (int i = 0; i < MAX_NUM_PU_SIZES; i++)
198
0
    {
199
0
        const PUBlock& pu = g_puLookup[i];
200
0
        int startIdx = g_puStartIdx[pu.width + pu.height][static_cast<int>(pu.partsize)];
201
202
0
        if (pu.width > cuSize || pu.height > cuSize || (pu.width != cuSize && pu.height != cuSize))
203
0
            continue;
204
205
0
        if (!m_param->bEnableAMP && pu.isAmp)
206
0
            continue;
207
0
        if (!m_param->bEnableRectInter && pu.width != pu.height && !pu.isAmp)
208
0
            continue;
209
210
0
        int blockWidth = pu.isAmp ? X265_MAX(pu.width, pu.height) : pu.width;
211
0
        int blockHeight = pu.isAmp ? blockWidth : pu.height;
212
213
0
        int numColsCTU = m_param->maxCUSize / blockWidth;
214
0
        int numRowsCTU = m_param->maxCUSize / blockHeight;
215
216
0
        int puOffset = 0;
217
0
        if (pu.isAmp)
218
0
            puOffset = numRowsCTU * numColsCTU;
219
0
        else if (pu.partsize == SIZE_2NxN)
220
0
            puOffset = numColsCTU;
221
0
        else if (pu.partsize == SIZE_Nx2N)
222
0
            puOffset = 1;
223
224
0
        int col = (cuX - ctu.m_cuPelX) / blockWidth;
225
0
        int row = (cuY - ctu.m_cuPelY) / blockHeight;
226
227
0
        finalIdx = startIdx + row * numColsCTU + col;
228
229
0
        int subIdx =finalIdx - startIdx;
230
231
0
        int puRow = subIdx / numColsCTU;
232
0
        int puCol = subIdx % numColsCTU;
233
 
234
0
        int leftIdx = (puCol > 0) ? startIdx + puRow * numColsCTU + (puCol - 1) : -1;
235
0
        int aboveIdx = (puRow > 0) ? startIdx + (puRow - 1) * numColsCTU + puCol : -1;
236
0
        int aboveLeftIdx = (puRow > 0 && puCol > 0) ? startIdx + (puRow - 1) * numColsCTU + (puCol - 1) : -1;
237
0
        int aboveRightIdx = (puRow > 0 && puCol < numColsCTU - 1) ? startIdx + (puRow - 1) * numColsCTU + (puCol + 1) : -1;
238
239
0
        int neighborIdx[MD_ABOVE_LEFT + 1] = { leftIdx, aboveIdx, aboveRightIdx, -1, aboveLeftIdx};
240
241
0
        cu.initSubCU(ctu, cuGeom, qp);
242
0
        cu.setPartSizeSubParts(pu.partsize);
243
0
        setLambdaFromQP(cu, qp);
244
0
        puMotionEstimation(m_slice, cuGeom, cu, m_frame->m_fencPic, puOffset, pu.partsize, areaId, finalIdx, false, neighborIdx);
245
0
    }
246
0
}
247
248
void Analysis::deriveMVsForCTU(CUData& ctu, const CUGeom& cuGeom, Frame& frame)
249
0
{
250
0
    m_slice = ctu.m_slice;
251
0
    m_frame = &frame;
252
0
    m_param = m_frame->m_param;
253
254
0
    ModeDepth& md = m_modeDepth[0];
255
256
0
    int numPredDir = m_slice->isInterP() ? 1 : 2;
257
258
    // Full CTU
259
0
    int baseQP = setLambdaFromQP(ctu, ctu.m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : ctu.m_slice->m_sliceQp);
260
261
0
    md.pred[PRED_2Nx2N].cu.initSubCU(ctu, cuGeom, baseQP);
262
0
    md.pred[PRED_2Nx2N].cu.setPartSizeSubParts(SIZE_2Nx2N);
263
264
0
    puMotionEstimation(m_slice, cuGeom, md.pred[PRED_2Nx2N].cu, frame.m_fencPic, 0, SIZE_2Nx2N, 0, 0, true);
265
266
    // Sub-CUs
267
0
    if (m_param->maxCUSize != m_param->minCUSize)
268
0
    {
269
0
        for (int sub = 0; sub < 4; sub++)
270
0
        {
271
0
            ModeDepth& md1 = m_modeDepth[1];
272
273
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + sub);
274
0
            int qp = setLambdaFromQP(ctu, ctu.m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, childGeom) : ctu.m_slice->m_sliceQp);
275
276
0
            md1.pred[PRED_2Nx2N].cu.initSubCU(ctu, childGeom, qp);
277
0
            md1.pred[PRED_2Nx2N].cu.setPartSizeSubParts(SIZE_2Nx2N);
278
279
0
            puMotionEstimation(m_slice, childGeom, md1.pred[PRED_2Nx2N].cu, frame.m_fencPic, 0, SIZE_2Nx2N, sub + 1, 0, true);
280
0
        }
281
0
    }
282
283
0
    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
284
0
    const CUData* colCU = colPic->m_encData->getPicCTU(ctu.m_cuAddr);
285
286
0
    for (int list = 0; list < numPredDir; list++)
287
0
    {
288
0
        int numRef = ctu.m_slice->m_numRefIdx[list];
289
290
0
        for (int ref = 0; ref < numRef; ref++)
291
0
        {
292
0
            MV medianMv;
293
0
            bool valid = ctu.getMedianColMV(colCU, colPic, list, ref, medianMv);
294
0
            if (!valid)
295
0
                continue;
296
297
0
            for (int areaIdx = 0; areaIdx < 5; areaIdx++)
298
0
            {
299
0
                m_areaBestMV[areaIdx][list][ref] = medianMv;
300
0
            }
301
0
        }
302
0
    }
303
304
0
    computeMVForPUs(ctu, cuGeom, baseQP, frame);
305
306
0
}
307
308
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
309
13.7k
{
310
13.7k
    m_slice = ctu.m_slice;
311
13.7k
    m_frame = &frame;
312
13.7k
    m_bChromaSa8d = m_param->rdLevel >= 3;
313
13.7k
    m_param = m_frame->m_param;
314
315
#if _DEBUG || CHECKED_BUILD
316
    invalidateContexts(0);
317
#endif
318
319
13.7k
    int qp = setLambdaFromQP(ctu, m_slice->m_pps->bUseDQP ? calculateQpforCuSize(ctu, cuGeom) : m_slice->m_sliceQp);
320
13.7k
    ctu.setQPSubParts((int8_t)qp, 0, 0);
321
322
13.7k
    m_rqt[0].cur.load(initialContext);
323
13.7k
    ctu.m_meanQP = initialContext.m_meanQP;
324
13.7k
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
325
326
13.7k
    if (m_param->bSsimRd)
327
0
        calculateNormFactor(ctu, qp);
328
329
13.7k
    uint32_t numPartition = ctu.m_numPartitions;
330
13.7k
    if (m_param->bCTUInfo && m_frame->m_ctuInfo && m_frame->m_ctuInfo[ctu.m_cuAddr])
331
0
    {
332
0
        x265_ctu_info_t* ctuTemp = m_frame->m_ctuInfo[ctu.m_cuAddr];
333
0
        int32_t depthIdx = 0;
334
0
        uint32_t maxNum8x8Partitions = 64;
335
0
        uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr];
336
0
        uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
337
0
        int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
338
0
        do
339
0
        {
340
0
            uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx];
341
0
            uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx));
342
0
            int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx];
343
0
            memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth);
344
0
            memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth);
345
0
            memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth);
346
0
            for (uint32_t l = 0; l < numPartition >> 2 * depth; l++)
347
0
                prevCtuInfoChangePtr[l] = prevCtuInfoChange;
348
0
            depthInfoPtr += ctu.m_numPartitions >> 2 * depth;
349
0
            contentInfoPtr += ctu.m_numPartitions >> 2 * depth;
350
0
            prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth;
351
0
            depthIdx++;
352
0
        } while (ctuTemp->ctuPartitions[depthIdx] != 0);
353
354
0
        m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr];
355
0
        m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr];
356
0
        memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition);
357
        //Calculate log2CUSize from depth
358
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
359
0
            ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
360
0
    }
361
13.7k
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
362
0
    {
363
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
364
0
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
365
0
        for (int dir = 0; dir < numPredDir; dir++)
366
0
        {
367
0
            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
368
0
            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
369
0
        }
370
0
        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
371
0
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
372
0
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
373
0
    }
374
    
375
13.7k
    int reuseLevel = X265_MAX(m_param->analysisSaveReuseLevel, m_param->analysisLoadReuseLevel);
376
13.7k
    if ((strlen(m_param->analysisSave) || strlen(m_param->analysisLoad)) && m_slice->m_sliceType != I_SLICE && reuseLevel > 1 && reuseLevel < 10)
377
0
    {
378
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
379
0
        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
380
0
        if (((m_param->analysisSaveReuseLevel > 1) && (m_param->analysisSaveReuseLevel < 7)) ||
381
0
            ((m_param->analysisLoadReuseLevel > 1) && (m_param->analysisLoadReuseLevel < 7)))
382
0
            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
383
0
        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
384
0
        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
385
0
        if (reuseLevel > 4)
386
0
        {
387
0
            m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
388
0
            m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
389
0
        }
390
0
        if (strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
391
0
            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
392
0
                m_reuseRef[i] = -1;
393
0
    }
394
13.7k
    ProfileCUScope(ctu, totalCTUTime, totalCTUs);
395
396
#if  ENABLE_SCC_EXT
397
    memset(m_ibc.m_BVs, 0, sizeof(m_ibc.m_BVs));
398
    memset(m_ibc.m_lastIntraBCMv, 0, sizeof(m_ibc.m_lastIntraBCMv));
399
    m_ibc.m_numBV16s = 0; m_ibc.m_numBVs = 0;
400
#endif
401
13.7k
    if (m_slice->m_sliceType == I_SLICE || (m_param->bEnableSCC && (m_slice->m_numRefIdx[0] == 1) && m_slice->m_refPOCList[0][0] == m_slice->m_poc))
402
13.7k
    {
403
13.7k
        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
404
13.7k
        if (m_param->analysisLoadReuseLevel > 1)
405
0
        {
406
0
            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
407
0
            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
408
0
            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
409
0
            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
410
0
        }
411
#if ENABLE_SCC_EXT
412
        compressIntraCU(ctu, cuGeom, qp, &m_ibc);
413
#else
414
13.7k
        compressIntraCU(ctu, cuGeom, qp);
415
13.7k
#endif
416
13.7k
    }
417
18.4E
    else
418
18.4E
    {
419
18.4E
        bool bCopyAnalysis = ((m_param->analysisLoadReuseLevel == 10) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16));
420
18.4E
        bool bCompressInterCUrd0_4 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel <= 4);
421
18.4E
        bool bCompressInterCUrd5_6 = (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
422
18.4E
        bCopyAnalysis = bCopyAnalysis || bCompressInterCUrd0_4 || bCompressInterCUrd5_6;
423
424
18.4E
        if (bCopyAnalysis)
425
0
        {
426
0
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
427
0
            int posCTU = ctu.m_cuAddr * numPartition;
428
0
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
429
0
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
430
0
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
431
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
432
0
                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);
433
434
0
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
435
0
            {
436
0
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
437
0
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
438
0
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
439
0
            }
440
            //Calculate log2CUSize from depth
441
0
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
442
0
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
443
0
        }
444
445
18.4E
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
446
0
            ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
447
0
            && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
448
0
            compressIntraCU(ctu, cuGeom, qp);
449
18.4E
        else if (!m_param->rdLevel)
450
0
        {
451
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
452
             * they are available for intra predictions */
453
0
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic[0], ctu.m_cuAddr, 0);
454
455
0
            compressInterCU_rd0_4(ctu, cuGeom, qp);
456
457
            /* generate residual for entire CTU at once and copy to reconPic */
458
0
            encodeResidue(ctu, cuGeom);
459
0
        }
460
18.4E
        else if ((m_param->analysisLoadReuseLevel == 10 && (!(m_param->bAnalysisType == HEVC_INFO) || m_slice->m_sliceType != P_SLICE)) ||
461
0
                ((m_param->bAnalysisType == AVC_INFO) && m_param->analysisLoadReuseLevel >= 7 && ctu.m_numPartitions <= 16))
462
0
        {
463
0
            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
464
0
            int posCTU = ctu.m_cuAddr * numPartition;
465
0
            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
466
0
            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
467
0
            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
468
0
            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bAnalysisType == AVC_INFO))
469
0
            {
470
0
                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
471
0
                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
472
0
                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
473
0
            }
474
            //Calculate log2CUSize from depth
475
0
            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
476
0
                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
477
478
0
            qprdRefine (ctu, cuGeom, qp, qp);
479
0
            return *m_modeDepth[0].bestMode;
480
0
        }
481
18.4E
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
482
0
            compressInterCU_dist(ctu, cuGeom, qp);
483
18.4E
        else if (m_param->rdLevel <= 4)
484
0
            compressInterCU_rd0_4(ctu, cuGeom, qp);
485
18.4E
        else
486
#if ENABLE_SCC_EXT
487
            compressInterCU_rd5_6(ctu, cuGeom, qp, &m_ibc);
488
#else
489
18.4E
            compressInterCU_rd5_6(ctu, cuGeom, qp);
490
18.4E
#endif
491
18.4E
    }
492
493
13.7k
    if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP)
494
0
        qprdRefine(ctu, cuGeom, qp, qp);
495
496
13.7k
    if (m_param->csvLogLevel >= 2)
497
0
        collectPUStatistics(ctu, cuGeom);
498
499
13.7k
    return *m_modeDepth[0].bestMode;
500
13.7k
}
501
502
void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom)
503
0
{
504
0
    uint8_t depth = 0;
505
0
    uint8_t partSize = 0;
506
0
    for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
507
0
    {
508
0
        depth = ctu.m_cuDepth[absPartIdx];
509
0
        partSize = ctu.m_partSize[absPartIdx];
510
0
        uint32_t numPU = nbPartsTable[(int)partSize];
511
0
        int shift = 2 * (m_param->maxCUDepth + 1 - depth);
512
0
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
513
0
        {
514
0
            PredictionUnit pu(ctu, cuGeom, puIdx);
515
0
            int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx);
516
0
            int mode = 1;
517
0
            if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN)
518
0
                mode = 2;
519
0
            else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
520
0
                 mode = 3;
521
0
            if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
522
0
            {
523
0
                ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
524
0
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
525
0
            }
526
0
            else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
527
0
            {
528
0
                if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN)
529
0
                {
530
0
                    ctu.m_encData->m_frameStats.cnt4x4++;
531
0
                    ctu.m_encData->m_frameStats.totalPu[4]++;
532
0
                }
533
0
                else
534
0
                {
535
0
                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
536
0
                    ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
537
0
                }
538
0
            }
539
0
            else if (mode == 3)
540
0
            {
541
0
                ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
542
0
                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
543
0
                break;
544
0
            }
545
0
            else
546
0
            {
547
0
                if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx])
548
0
                    ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
549
0
                else
550
0
                    ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode;
551
552
0
                ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode;
553
0
            }
554
0
        }
555
0
    }
556
0
}
557
558
int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
559
0
{
560
0
    float predDepth = 0;
561
0
    CUData* neighbourCU;
562
0
    uint8_t count = 0;
563
0
    int32_t maxTUDepth = -1;
564
0
    neighbourCU = &m_slice->m_refFrameList[0][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
565
0
    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
566
0
    count++;
567
0
    if (m_slice->isInterB())
568
0
    {
569
0
        neighbourCU = &m_slice->m_refFrameList[1][0]->m_encData->m_picCTU[parentCTU.m_cuAddr];
570
0
        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
571
0
        count++;
572
0
    }
573
0
    if (parentCTU.m_cuAbove)
574
0
    {
575
0
        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
576
0
        count++;
577
0
        if (parentCTU.m_cuAboveLeft)
578
0
        {
579
0
            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
580
0
            count++;
581
0
        }
582
0
        if (parentCTU.m_cuAboveRight)
583
0
        {
584
0
            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
585
0
            count++;
586
0
        }
587
0
    }
588
0
    if (parentCTU.m_cuLeft)
589
0
    {
590
0
        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
591
0
        count++;
592
0
    }
593
0
    predDepth /= count;
594
595
0
    if (predDepth == 0)
596
0
        maxTUDepth = 0;
597
0
    else if (predDepth < 1)
598
0
        maxTUDepth = 1;
599
0
    else if (predDepth >= 1 && predDepth <= 1.5)
600
0
        maxTUDepth = 2;
601
0
    else if (predDepth > 1.5 && predDepth <= 2.5)
602
0
        maxTUDepth = 3;
603
0
    else
604
0
        maxTUDepth = -1;
605
606
0
    return maxTUDepth;
607
0
}
608
609
void Analysis::tryLossless(const CUGeom& cuGeom)
610
0
{
611
0
    ModeDepth& md = m_modeDepth[cuGeom.depth];
612
613
0
    if (!md.bestMode->distortion)
614
        /* already lossless */
615
0
        return;
616
0
    else if (md.bestMode->cu.isIntra(0))
617
0
    {
618
0
        md.pred[PRED_LOSSLESS].initCosts();
619
0
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
620
0
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
621
0
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
622
0
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
623
0
    }
624
0
    else
625
0
    {
626
0
        md.pred[PRED_LOSSLESS].initCosts();
627
0
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
628
0
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
629
0
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
630
0
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
631
0
    }
632
0
}
633
634
void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
635
0
{
636
0
    uint32_t depth = cuGeom.depth;
637
0
    ModeDepth& md = m_modeDepth[depth];
638
0
    md.bestMode = NULL;
639
640
0
    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
641
642
0
    int bestCUQP = qp;
643
0
    int lambdaQP = lqp;
644
0
    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
645
0
    if (m_param->analysisLoadReuseLevel >= 7)
646
0
        doQPRefine = false;
647
0
    if (doQPRefine)
648
0
    {
649
0
        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
650
651
0
        int cuIdx = (cuGeom.childOffset - 1) / 3;
652
0
        bestCUCost = origCUCost = cacheCost[cuIdx];
653
654
0
        int direction = m_param->bOptCUDeltaQP ? 1 : 2;
655
656
0
        for (int dir = direction; dir >= -direction; dir -= (direction * 2))
657
0
        {
658
0
            if (m_param->bOptCUDeltaQP && ((dir != 1) || ((qp + 3) >= (int32_t)parentCTU.m_meanQP)))
659
0
                break;
660
661
0
            int threshold = 1;
662
0
            int failure = 0;
663
0
            cuPrevCost = origCUCost;
664
665
0
            int modCUQP = qp + dir;
666
0
            while (modCUQP >= m_param->rc.qpMin && modCUQP <= QP_MAX_SPEC)
667
0
            {
668
0
                if (m_param->bOptCUDeltaQP && modCUQP > (int32_t)parentCTU.m_meanQP)
669
0
                    break;
670
671
0
                recodeCU(parentCTU, cuGeom, modCUQP, qp);
672
0
                cuCost = md.bestMode->rdCost;
673
674
0
                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
675
0
                if (cuCost < cuPrevCost)
676
0
                    failure = 0;
677
0
                else
678
0
                    failure++;
679
680
0
                if (failure > threshold)
681
0
                    break;
682
683
0
                cuPrevCost = cuCost;
684
0
                modCUQP += dir;
685
0
            }
686
0
        }
687
0
        lambdaQP = bestCUQP;
688
0
    }
689
690
0
    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
691
692
    /* Copy best data to encData CTU and recon */
693
0
    md.bestMode->cu.copyToPic(depth);
694
0
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
695
0
}
696
697
#if ENABLE_SCC_EXT
/* Recursive intra-only CU analysis for an I-slice CTU. Evaluates 2Nx2N (and
 * NxN at the smallest size) intra prediction at this depth, optionally IntraBC
 * modes when screen-content coding is enabled, then recurses into the four
 * sub-CUs and keeps whichever of {this depth, split} has the lower RD cost.
 * Returns the best mode's rdCost so a parent can early-out of split checks.
 * 'ibc' carries the last coded IntraBC MVs across sibling CUs (SCC only). */
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
#else
uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
#endif
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    /* per-partition-shape MV candidate scratch used by the IntraBC searches
     * below; zero-initialized unconditionally so it is always defined */
    MV iMVCandList[4][10];
    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    /* analysis-load reuse: a prior pass may have already decided this CU's
     * intra direction and depth (not when intraRefine==4 or HEVC_INFO input) */
    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
    int split = 0;
    if (m_param->intraRefine && m_param->intraRefine != 4)
    {
        /* when analysis was scaled from a lower resolution, force a split
         * check one level beyond the reused decision */
        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
        if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
            bAlreadyDecided = false;
    }

    if (bAlreadyDecided)
    {
        if (bDecidedDepth && mightNotSplit)
        {
            /* re-encode with the reused partition size (and optionally the
             * reused intra directions) rather than searching all modes */
            Mode& mode = md.pred[0];
            md.bestMode = &mode;
            mode.cu.initSubCU(parentCTU, cuGeom, qp);
            /* intraRefine 2/3 re-derive (some) directions instead of copying */
            bool reuseModes = !((m_param->intraRefine == 3) ||
                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
            if (reuseModes)
            {
                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
            }
            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
        }
    }
    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
    {
        /* full intra search at this depth (64x64 CUs are never coded intra
         * directly; they are always split first) */
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
        checkBestMode(md.pred[PRED_INTRA], depth);

        /* NxN partitions are only legal at 8x8 with 4x4 TUs allowed */
        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

#if ENABLE_SCC_EXT
        /* screen-content coding: try IntraBC merge and explicit IntraBC
         * partitions against the regular intra modes */
        bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false;
        if (m_param->bEnableSCC)
        {
            md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
            checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);

            md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
            checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
            checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);

            if (intraBlockCopyFastSearch)
            {
                /* fast search only tries rectangular IntraBC at max depth */
                if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
                {
                    md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
                    checkBestMode(md.pred[PRED_IBC_Nx2N], depth);

                    md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
                    checkBestMode(md.pred[PRED_IBC_2NxN], depth);
                }
            }
            else
            {
                md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
                checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
                checkBestMode(md.pred[PRED_IBC_2NxN], depth);

                md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
                checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
                checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
            }
        }
#endif

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

#if ENABLE_SCC_EXT
    // If Intra BC keep last coded Mv
    /* maintain the two-deep history of last coded IntraBC MVs (used as
     * predictors/merge candidates for subsequent CUs); "inter" here means
     * the IntraBC reference, which is the current picture itself */
    if (md.bestMode && md.bestMode->cu.isInter(0))
    {
        MVField mvField;
        const CUData* cu = &md.bestMode->cu;
        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
        int iRefIdxFirst = mvField.refIdx;
        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
        int iRefIdxLast = mvField.refIdx;
        /* a PU is IntraBC when its list-0 reference POC equals the current POC */
        bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
        bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;

        if (isIntraBCFirst || isIntraBCLast)
        {
            if (cu->m_partSize[0] == SIZE_2Nx2N)
            {
                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                if (mvField.mv != cu->m_lastIntraBCMv[0])
                {
                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                }
            }
            else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
            {
                // mixed PU, only one partition is IntraBC coded
                if (isIntraBCFirst != isIntraBCLast)
                {
                    if (isIntraBCFirst)
                    {
                        // Part 0
                        md.bestMode->cu.getMvField(cu, 0, 0, mvField);
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
                        {
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                        }
                    }
                    else if (isIntraBCLast)
                    {
                        // Part 1
                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
                        {
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                        }
                    }
                }
                else // normal IntraBC CU
                {
                    // Part 0
                    md.bestMode->cu.getMvField(cu, 0, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                    // Part 1
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                }
            }
            else
            {
                // NxN
                for (int part = 0; part < 4; part++)
                {
                    md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
                    if (mvField.mv != cu->m_lastIntraBCMv[0])
                    {
                        md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
                        md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
                    }
                }
            }
        }
    } // is inter
#endif

    // stop recursion if we reach the depth of previous analysis decision
    mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom, qp);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;
        int32_t nextQP = qp;
        uint64_t curCost = 0;
        int skipSplitCheck = 0;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                m_rqt[nextDepth].cur.load(*nextContext);

                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));

                if (m_param->bEnableSplitRdSkip)
                {
                    /* accumulate child costs and abandon the split as soon as
                     * they exceed the cost already found at this depth */
#if ENABLE_SCC_EXT
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP, ibc);
#else
                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
#endif
                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
                    {
                        skipSplitCheck = 1;
                        break;
                    }
                }
                else

#if !ENABLE_SCC_EXT
                    compressIntraCU(parentCTU, childGeom, nextQP);
#else
                    compressIntraCU(parentCTU, childGeom, nextQP, ibc);

                /* NOTE: only the call above is governed by the 'else'; this MV
                 * propagation runs for every present child in SCC builds */
                if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0)
                {
                    for (int i = 0; i < 2; i++)
                    {
                        ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i];
                    }
                }
#endif

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);

                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
                if (bAlreadyDecided)
                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
            }
        }
        if (!skipSplitCheck)
        {
            nextContext->store(splitPred->contexts);
            if (mightNotSplit)
                addSplitFlagCost(*splitPred, cuGeom.depth);
            else
                updateModeCost(*splitPred);

            checkDQPForSplitPred(*splitPred, cuGeom);
            checkBestMode(*splitPred, depth);
        }
    }

    /* cache this CU's cost so a later rd-refine pass (qprdRefine) can compare
     * alternative QPs against it; indexed by position in the geom tree */
    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        int cuIdx = (cuGeom.childOffset - 1) / 3;
        cacheCost[cuIdx] = md.bestMode->rdCost;
    }

    /* record the max TU depth used here so neighbouring CTUs can limit their
     * own TU recursion (X265_TU_LIMIT_NEIGH) */
    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
    {
        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
        int8_t maxTUDepth = -1;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
            maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
    }

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
    {
        /* with SCC enabled there is a second recon picture to update;
         * split winners were already copied child-by-child above */
        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
            md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx);
    }

    return md.bestMode->rdCost;
}
1001
1002
void Analysis::PMODE::processTasks(int workerThreadId)
1003
0
{
1004
#if DETAILED_CU_STATS
1005
    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
1006
    master.m_stats[fe].countPModeTasks++;
1007
    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
1008
#endif
1009
0
    ProfileScopeEvent(pmode);
1010
0
    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
1011
0
}
1012
1013
/* process pmode jobs until none remain; may be called by the master thread or by
 * a bonded peer (slave) thread via pmodeTasks() */
void Analysis::processPmode(PMODE& pmode, Analysis& slave)
{
    /* acquire a mode task, else exit early */
    int task;
    pmode.m_lock.acquire();
    if (pmode.m_jobTotal > pmode.m_jobAcquired)
    {
        task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    else
    {
        pmode.m_lock.release();
        return;
    }

    ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];

    /* setup slave Analysis */
    /* a peer thread's Analysis must mirror the master's per-frame state and
     * entropy context before it can evaluate modes on the master's behalf */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
        slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
        slave.invalidateContexts(0);
        slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
    }

    /* perform Mode task, repeat until no more work is available */
    do
    {
        /* per-PU allowed-reference masks, derived from the refs the four
         * sub-CUs chose (m_splitRefIdx) to limit the motion search */
        uint32_t refMasks[2] = { 0, 0 };

        if (m_param->rdLevel <= 4)
        {
            /* sa8d-based path: checkInter_rd0_4 variants */
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                if (m_param->rdLevel > 2)
                    slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                if (m_slice->m_sliceType == B_SLICE)
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */

                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */

                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }
        else
        {
            /* full-RDO path: checkInter_rd5_6 variants */
            switch (pmode.modes[task])
            {
            case PRED_INTRA:
                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
                if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
                break;

            case PRED_2Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];

                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
                /* mark BIDIR invalid until checkBidir2Nx2N proves otherwise */
                md.pred[PRED_BIDIR].rdCost = MAX_INT64;
                if (m_slice->m_sliceType == B_SLICE)
                {
                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                        slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
                }
                break;

            case PRED_Nx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */

                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
                break;

            case PRED_2NxN:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
                break;

            case PRED_2NxnU:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */

                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
                break;

            case PRED_2NxnD:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
                break;

            case PRED_nLx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */

                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
                break;

            case PRED_nRx2N:
                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
                break;

            default:
                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
                break;
            }
        }

        /* try to grab the next job; exit the loop when none remain */
        task = -1;
        pmode.m_lock.acquire();
        if (pmode.m_jobTotal > pmode.m_jobAcquired)
            task = pmode.m_jobAcquired++;
        pmode.m_lock.release();
    }
    while (task >= 0);
}
1192
1193
uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1194
0
{
1195
0
    uint32_t depth = cuGeom.depth;
1196
0
    uint32_t cuAddr = parentCTU.m_cuAddr;
1197
0
    ModeDepth& md = m_modeDepth[depth];
1198
0
    md.bestMode = NULL;
1199
1200
0
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1201
0
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1202
0
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
1203
0
    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
1204
1205
0
    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
1206
1207
0
    PMODE pmode(*this, cuGeom);
1208
1209
0
    if (mightNotSplit && depth >= minDepth)
1210
0
    {
1211
        /* Initialize all prediction CUs based on parentCTU */
1212
0
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1213
0
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1214
1215
0
        if (m_param->rdLevel <= 4)
1216
0
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1217
0
        else
1218
0
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1219
0
    }
1220
1221
0
    bool bNoSplit = false;
1222
0
    bool splitIntra = true;
1223
0
    if (md.bestMode)
1224
0
    {
1225
0
        bNoSplit = md.bestMode->cu.isSkipped(0);
1226
0
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
1227
0
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1228
0
    }
1229
1230
0
    if (mightSplit && !bNoSplit)
1231
0
    {
1232
0
        Mode* splitPred = &md.pred[PRED_SPLIT];
1233
0
        splitPred->initCosts();
1234
0
        CUData* splitCU = &splitPred->cu;
1235
0
        splitCU->initSubCU(parentCTU, cuGeom, qp);
1236
1237
0
        uint32_t nextDepth = depth + 1;
1238
0
        ModeDepth& nd = m_modeDepth[nextDepth];
1239
0
        invalidateContexts(nextDepth);
1240
0
        Entropy* nextContext = &m_rqt[depth].cur;
1241
0
        int nextQP = qp;
1242
0
        splitIntra = false;
1243
1244
0
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1245
0
        {
1246
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1247
0
            if (childGeom.flags & CUGeom::PRESENT)
1248
0
            {
1249
0
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1250
0
                m_rqt[nextDepth].cur.load(*nextContext);
1251
1252
0
                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1253
0
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1254
1255
0
                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);
1256
1257
                // Save best CU and pred data for this sub CU
1258
0
                splitIntra |= nd.bestMode->cu.isIntra(0);
1259
0
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1260
0
                splitPred->addSubCosts(*nd.bestMode);
1261
1262
0
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1263
0
                nextContext = &nd.bestMode->contexts;
1264
0
            }
1265
0
            else
1266
0
                splitCU->setEmptyPart(childGeom, subPartIdx);
1267
0
        }
1268
0
        nextContext->store(splitPred->contexts);
1269
1270
0
        if (mightNotSplit)
1271
0
            addSplitFlagCost(*splitPred, cuGeom.depth);
1272
0
        else
1273
0
            updateModeCost(*splitPred);
1274
1275
0
        checkDQPForSplitPred(*splitPred, cuGeom);
1276
0
    }
1277
1278
0
    if (mightNotSplit && depth >= minDepth)
1279
0
    {
1280
0
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
1281
0
        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);
1282
1283
0
        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1284
0
            setLambdaFromQP(parentCTU, qp);
1285
1286
0
        if (bTryIntra)
1287
0
        {
1288
0
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1289
0
            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
1290
0
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1291
0
            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
1292
0
        }
1293
0
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
1294
0
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1295
0
        if (m_param->bEnableRectInter)
1296
0
        {
1297
0
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
1298
0
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
1299
0
        }
1300
0
        if (bTryAmp)
1301
0
        {
1302
0
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
1303
0
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
1304
0
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
1305
0
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
1306
0
        }
1307
1308
0
        m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
1309
1310
0
        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
1311
1312
        /* participate in processing jobs, until all are distributed */
1313
0
        processPmode(pmode, *this);
1314
1315
        /* the master worker thread (this one) does merge analysis. By doing
1316
         * merge after all the other jobs are at least started, we usually avoid
1317
         * blocking on another thread */
1318
1319
0
        if (m_param->rdLevel <= 4)
1320
0
        {
1321
0
            {
1322
0
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
1323
0
                pmode.waitForExit();
1324
0
            }
1325
1326
            /* select best inter mode based on sa8d cost */
1327
0
            Mode *bestInter = &md.pred[PRED_2Nx2N];
1328
1329
0
            if (m_param->bEnableRectInter)
1330
0
            {
1331
0
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1332
0
                    bestInter = &md.pred[PRED_Nx2N];
1333
0
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1334
0
                    bestInter = &md.pred[PRED_2NxN];
1335
0
            }
1336
1337
0
            if (bTryAmp)
1338
0
            {
1339
0
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1340
0
                    bestInter = &md.pred[PRED_2NxnU];
1341
0
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1342
0
                    bestInter = &md.pred[PRED_2NxnD];
1343
0
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1344
0
                    bestInter = &md.pred[PRED_nLx2N];
1345
0
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1346
0
                    bestInter = &md.pred[PRED_nRx2N];
1347
0
            }
1348
1349
0
            if (m_param->rdLevel > 2)
1350
0
            {
1351
                /* RD selection between merge, inter, bidir and intra */
1352
0
                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1353
0
                {
1354
0
                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
1355
0
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1356
0
                    {
1357
0
                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1358
0
                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1359
0
                    }
1360
0
                }
1361
0
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1362
0
                checkBestMode(*bestInter, depth);
1363
1364
                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1365
0
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1366
0
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1367
0
                {
1368
0
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1369
0
                    checkBestMode(md.pred[PRED_BIDIR], depth);
1370
0
                }
1371
1372
0
                if (bTryIntra)
1373
0
                    checkBestMode(md.pred[PRED_INTRA], depth);
1374
0
            }
1375
0
            else /* m_param->rdLevel == 2 */
1376
0
            {
1377
0
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1378
0
                    md.bestMode = bestInter;
1379
1380
0
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1381
0
                    md.bestMode = &md.pred[PRED_BIDIR];
1382
1383
0
                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1384
0
                {
1385
0
                    md.bestMode = &md.pred[PRED_INTRA];
1386
0
                    encodeIntraInInter(*md.bestMode, cuGeom);
1387
0
                }
1388
0
                else if (!md.bestMode->cu.m_mergeFlag[0])
1389
0
                {
1390
                    /* finally code the best mode selected from SA8D costs */
1391
0
                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
1392
0
                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1393
0
                    {
1394
0
                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
1395
0
                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
1396
0
                    }
1397
0
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
1398
0
                }
1399
0
            }
1400
0
        }
1401
0
        else
1402
0
        {
1403
0
            {
1404
0
                ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
1405
0
                pmode.waitForExit();
1406
0
            }
1407
1408
0
            checkBestMode(md.pred[PRED_2Nx2N], depth);
1409
0
            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
1410
0
                checkBestMode(md.pred[PRED_BIDIR], depth);
1411
1412
0
            if (m_param->bEnableRectInter)
1413
0
            {
1414
0
                checkBestMode(md.pred[PRED_Nx2N], depth);
1415
0
                checkBestMode(md.pred[PRED_2NxN], depth);
1416
0
            }
1417
1418
0
            if (bTryAmp)
1419
0
            {
1420
0
                checkBestMode(md.pred[PRED_2NxnU], depth);
1421
0
                checkBestMode(md.pred[PRED_2NxnD], depth);
1422
0
                checkBestMode(md.pred[PRED_nLx2N], depth);
1423
0
                checkBestMode(md.pred[PRED_nRx2N], depth);
1424
0
            }
1425
1426
0
            if (bTryIntra)
1427
0
            {
1428
0
                checkBestMode(md.pred[PRED_INTRA], depth);
1429
0
                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
1430
0
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1431
0
            }
1432
0
        }
1433
1434
0
        if (m_bTryLossless)
1435
0
            tryLossless(cuGeom);
1436
1437
0
        if (mightSplit)
1438
0
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
1439
0
    }
1440
1441
    /* compare split RD cost against best cost */
1442
0
    if (mightSplit && !bNoSplit)
1443
0
        checkBestMode(md.pred[PRED_SPLIT], depth);
1444
1445
    /* determine which motion references the parent CU should search */
1446
0
    uint32_t refMask;
1447
0
    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1448
0
        refMask = 0;
1449
0
    else if (md.bestMode == &md.pred[PRED_SPLIT])
1450
0
        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
1451
0
    else
1452
0
    {
1453
        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1454
0
        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1455
0
        uint32_t numPU = cu.getNumPartInter(0);
1456
0
        refMask = 0;
1457
0
        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1458
0
            refMask |= cu.getBestRefIdx(subPartIdx);
1459
0
    }
1460
1461
0
    if (mightNotSplit)
1462
0
    {
1463
        /* early-out statistics */
1464
0
        FrameData& curEncData = *m_frame->m_encData;
1465
0
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1466
0
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
1467
0
        cuStat.count[depth] += 1;
1468
0
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
1469
0
    }
1470
1471
    /* Copy best data to encData CTU and recon */
1472
0
    md.bestMode->cu.copyToPic(depth);
1473
0
    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], cuAddr, cuGeom.absPartIdx);
1474
1475
0
    return refMask;
1476
0
}
1477
1478
SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
1479
0
{
1480
0
    if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))
1481
0
        return compressInterCU_rd5_6(parentCTU, cuGeom, qp);
1482
1483
0
    uint32_t depth = cuGeom.depth;
1484
0
    uint32_t cuAddr = parentCTU.m_cuAddr;
1485
0
    ModeDepth& md = m_modeDepth[depth];
1486
1487
1488
0
    if (m_param->searchMethod == X265_SEA)
1489
0
    {
1490
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
1491
0
        int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]);
1492
0
        for (int list = 0; list < numPredDir; list++)
1493
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
1494
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
1495
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
1496
0
    }
1497
1498
0
    PicYuv& reconPic = *m_frame->m_reconPic[0];
1499
0
    SplitData splitCUData;
1500
1501
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
1502
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
1503
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
1504
1505
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
1506
0
    {
1507
0
        md.bestMode = NULL;
1508
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
1509
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
1510
0
        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
1511
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
1512
0
        bool skipModes = false; /* Skip any remaining mode analyses at current depth */
1513
0
        bool skipRecursion = false; /* Skip recursion */
1514
0
        bool splitIntra = true;
1515
0
        bool skipRectAmp = false;
1516
0
        bool chooseMerge = false;
1517
0
        bool bCtuInfoCheck = false;
1518
0
        int sameContentRef = 0;
1519
1520
0
        if (m_evaluateInter)
1521
0
        {
1522
0
            if (m_refineLevel == 2)
1523
0
            {
1524
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
1525
0
                    skipModes = true;
1526
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1527
0
                    skipRectAmp = true;
1528
0
            }
1529
0
            mightSplit &= false;
1530
0
            minDepth = depth;
1531
0
        }
1532
1533
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
1534
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
1535
1536
0
        SplitData splitData[4];
1537
0
        splitData[0].initSplitCUData();
1538
0
        splitData[1].initSplitCUData();
1539
0
        splitData[2].initSplitCUData();
1540
0
        splitData[3].initSplitCUData();
1541
1542
        // avoid uninitialize value in below reference
1543
0
        if (m_param->limitModes)
1544
0
        {
1545
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
1546
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
1547
0
            md.pred[PRED_2Nx2N].sa8dCost = 0;
1548
0
        }
1549
1550
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
1551
0
        {
1552
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
1553
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
1554
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
1555
0
            {
1556
0
                mightNotSplit &= bDecidedDepth;
1557
0
                bCtuInfoCheck = skipRecursion = false;
1558
0
                skipModes = true;
1559
0
            }
1560
0
            else if (mightNotSplit && bDecidedDepth)
1561
0
            {
1562
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
1563
0
                {
1564
0
                    bCtuInfoCheck = skipRecursion = true;
1565
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1566
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1567
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1568
0
                    if (!sameContentRef)
1569
0
                    {
1570
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
1571
0
                        {
1572
0
                            qp -= int32_t(0.04 * qp);
1573
0
                            setLambdaFromQP(parentCTU, qp);
1574
0
                        }
1575
0
                        if (m_param->bCTUInfo & 4)
1576
0
                            skipModes = false;
1577
0
                    }
1578
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
1579
0
                    {
1580
0
                        if (m_param->rdLevel)
1581
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1582
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
1583
0
                            skipModes = md.bestMode && true;
1584
0
                    }
1585
0
                }
1586
0
                else
1587
0
                {
1588
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1589
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1590
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1591
0
                    if (m_param->rdLevel)
1592
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
1593
0
                }
1594
0
                mightSplit &= !bDecidedDepth;
1595
0
            }
1596
0
        }
1597
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10))
1598
0
        {
1599
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1600
0
            {
1601
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1602
0
                {
1603
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1604
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1605
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1606
1607
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1608
0
                    if (m_param->rdLevel)
1609
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1610
0
                }
1611
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
1612
0
                {
1613
0
                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
1614
0
                    {
1615
0
                        skipRectAmp = true && !!md.bestMode;
1616
0
                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
1617
0
                    }
1618
0
                }
1619
0
            }
1620
0
        }
1621
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
1622
0
        {
1623
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
1624
0
            {
1625
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
1626
0
                {
1627
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1628
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1629
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1630
1631
0
                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
1632
0
                    if (m_param->rdLevel)
1633
0
                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
1634
0
                }
1635
0
            }
1636
0
        }
1637
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
1638
0
        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1639
            /* TODO: Re-evaluate if analysis load/save still works */
1640
0
        {
1641
            /* Compute Merge Cost */
1642
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
1643
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
1644
0
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
1645
0
            if (m_param->rdLevel)
1646
0
                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
1647
0
                && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
1648
0
        }
1649
0
        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
1650
0
        {
1651
0
            skipRecursion = md.bestMode->cu.isSkipped(0);
1652
0
            if (mightSplit && !skipRecursion)
1653
0
            {
1654
0
                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
1655
0
                {
1656
0
                    if (depth)
1657
0
                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
1658
0
                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
1659
0
                        skipRecursion = complexityCheckCU(*md.bestMode);
1660
0
                }
1661
0
                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
1662
0
                {
1663
0
                    skipRecursion = complexityCheckCU(*md.bestMode);
1664
0
                }
1665
1666
0
            }
1667
0
        }
1668
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
1669
0
            skipRecursion = true;
1670
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
1671
0
        if (mightSplit && !skipRecursion)
1672
0
        {
1673
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
1674
0
                qp = int((1 / 0.96) * qp + 0.5);
1675
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
1676
0
            splitPred->initCosts();
1677
0
            CUData* splitCU = &splitPred->cu;
1678
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
1679
1680
0
            uint32_t nextDepth = depth + 1;
1681
0
            ModeDepth& nd = m_modeDepth[nextDepth];
1682
0
            invalidateContexts(nextDepth);
1683
0
            Entropy* nextContext = &m_rqt[depth].cur;
1684
0
            int nextQP = qp;
1685
0
            splitIntra = false;
1686
1687
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1688
0
            {
1689
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1690
0
                if (childGeom.flags & CUGeom::PRESENT)
1691
0
                {
1692
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
1693
0
                    m_rqt[nextDepth].cur.load(*nextContext);
1694
1695
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
1696
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
1697
1698
0
                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
1699
1700
                    // Save best CU and pred data for this sub CU
1701
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
1702
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
1703
0
                    splitPred->addSubCosts(*nd.bestMode);
1704
1705
0
                    if (m_param->rdLevel)
1706
0
                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
1707
0
                    else
1708
0
                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
1709
0
                    if (m_param->rdLevel > 1)
1710
0
                        nextContext = &nd.bestMode->contexts;
1711
0
                }
1712
0
                else
1713
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
1714
0
            }
1715
0
            nextContext->store(splitPred->contexts);
1716
1717
0
            if (mightNotSplit)
1718
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
1719
0
            else if (m_param->rdLevel > 1)
1720
0
                updateModeCost(*splitPred);
1721
0
            else
1722
0
                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
1723
0
        }
1724
        /* If analysis mode is simple do not Evaluate other modes */
1725
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
1726
0
        {
1727
0
            if (m_slice->m_sliceType == P_SLICE)
1728
0
            {
1729
0
                if (m_checkMergeAndSkipOnly[0])
1730
0
                    skipModes = true;
1731
0
            }
1732
0
            else
1733
0
            {
1734
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
1735
0
                    skipModes = true;
1736
0
            }
1737
0
        }
1738
        /* Split CUs
1739
         *   0  1
1740
         *   2  3 */
1741
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1742
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1743
0
        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
1744
0
        {
1745
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
1746
0
                setLambdaFromQP(parentCTU, qp);
1747
1748
0
            if (!skipModes)
1749
0
            {
1750
0
                uint32_t refMasks[2];
1751
0
                refMasks[0] = allSplitRefs;
1752
0
                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1753
0
                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
1754
1755
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
1756
0
                {
1757
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
1758
0
                    uint32_t refMask = cu.getBestRefIdx(0);
1759
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1760
0
                }
1761
1762
0
                if (m_slice->m_sliceType == B_SLICE)
1763
0
                {
1764
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
1765
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
1766
0
                }
1767
1768
0
                Mode *bestInter = &md.pred[PRED_2Nx2N];
1769
0
                if (!skipRectAmp)
1770
0
                {
1771
0
                    if (m_param->bEnableRectInter)
1772
0
                    {
1773
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1774
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
1775
1776
0
                        if (m_slice->m_sliceType == P_SLICE)
1777
0
                        {
1778
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1779
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1780
0
                        }
1781
0
                        else
1782
0
                        {
1783
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1784
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1785
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1786
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1787
0
                        }
1788
1789
0
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1790
0
                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1791
0
                        {
1792
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1793
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1794
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1795
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1796
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1797
0
                                bestInter = &md.pred[PRED_2NxN];
1798
0
                        }
1799
1800
0
                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
1801
0
                        {
1802
0
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1803
0
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1804
0
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1805
0
                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1806
0
                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
1807
0
                                bestInter = &md.pred[PRED_Nx2N];
1808
0
                        }
1809
1810
0
                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
1811
0
                        {
1812
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1813
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1814
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1815
0
                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1816
0
                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
1817
0
                                bestInter = &md.pred[PRED_2NxN];
1818
0
                        }
1819
0
                    }
1820
1821
0
                    if (m_slice->m_sps->maxAMPDepth > depth)
1822
0
                    {
1823
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1824
0
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1825
1826
0
                        if (m_slice->m_sliceType == P_SLICE)
1827
0
                        {
1828
0
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1829
0
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1830
1831
0
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1832
0
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1833
0
                        }
1834
0
                        else
1835
0
                        {
1836
0
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
1837
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1838
0
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
1839
0
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1840
1841
0
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
1842
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1843
0
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
1844
0
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1845
0
                        }
1846
1847
0
                        bool bHor = false, bVer = false;
1848
0
                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
1849
0
                            bHor = true;
1850
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
1851
0
                            bVer = true;
1852
0
                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
1853
0
                            md.bestMode && md.bestMode->cu.getQtRootCbf(0))
1854
0
                        {
1855
0
                            bHor = true;
1856
0
                            bVer = true;
1857
0
                        }
1858
1859
0
                        if (bHor)
1860
0
                        {
1861
0
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1862
0
                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1863
0
                            {
1864
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1865
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1866
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1867
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1868
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1869
0
                                    bestInter = &md.pred[PRED_2NxnD];
1870
0
                            }
1871
1872
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
1873
0
                            {
1874
0
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1875
0
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
1876
0
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1877
0
                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1878
0
                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
1879
0
                                    bestInter = &md.pred[PRED_2NxnU];
1880
0
                            }
1881
1882
0
                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
1883
0
                            {
1884
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
1885
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1886
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1887
0
                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1888
0
                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
1889
0
                                    bestInter = &md.pred[PRED_2NxnD];
1890
0
                            }
1891
0
                        }
1892
0
                        if (bVer)
1893
0
                        {
1894
0
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1895
0
                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1896
0
                            {
1897
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1898
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1899
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1900
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1901
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1902
0
                                    bestInter = &md.pred[PRED_nRx2N];
1903
0
                            }
1904
1905
0
                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
1906
0
                            {
1907
0
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
1908
0
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
1909
0
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1910
0
                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1911
0
                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
1912
0
                                    bestInter = &md.pred[PRED_nLx2N];
1913
0
                            }
1914
1915
0
                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
1916
0
                            {
1917
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
1918
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1919
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1920
0
                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1921
0
                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
1922
0
                                    bestInter = &md.pred[PRED_nRx2N];
1923
0
                            }
1924
0
                        }
1925
0
                    }
1926
0
                }
1927
0
                bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
1928
0
                if (m_param->rdLevel >= 3)
1929
0
                {
1930
                    /* Calculate RD cost of best inter option */
1931
0
                    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
1932
0
                    {
1933
0
                        uint32_t numPU = bestInter->cu.getNumPartInter(0);
1934
0
                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1935
0
                        {
1936
0
                            PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
1937
0
                            motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
1938
0
                        }
1939
0
                    }
1940
1941
0
                    if (!chooseMerge)
1942
0
                    {
1943
0
                        encodeResAndCalcRdInterCU(*bestInter, cuGeom);
1944
0
                        checkBestMode(*bestInter, depth);
1945
1946
                        /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
1947
0
                        if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
1948
0
                            md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
1949
0
                        {
1950
0
                            uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
1951
0
                            if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
1952
0
                                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1953
0
                                {
1954
0
                                    PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
1955
0
                                    motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
1956
0
                                }
1957
0
                            encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
1958
0
                            checkBestMode(md.pred[PRED_BIDIR], depth);
1959
0
                        }
1960
0
                    }
1961
1962
0
                    if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
1963
0
                        md.bestMode->sa8dCost == MAX_INT64)
1964
0
                    {
1965
0
                        if (!m_param->limitReferences || splitIntra)
1966
0
                        {
1967
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1968
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1969
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1970
0
                            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
1971
0
                            checkBestMode(md.pred[PRED_INTRA], depth);
1972
0
                        }
1973
0
                        else
1974
0
                        {
1975
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
1976
0
                        }
1977
0
                    }
1978
0
                }
1979
0
                else
1980
0
                {
1981
                    /* SA8D choice between merge/skip, inter, bidir, and intra */
1982
0
                    if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
1983
0
                        md.bestMode = bestInter;
1984
1985
0
                    if (m_slice->m_sliceType == B_SLICE &&
1986
0
                        md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
1987
0
                        md.bestMode = &md.pred[PRED_BIDIR];
1988
1989
0
                    if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
1990
0
                    {
1991
0
                        if (!m_param->limitReferences || splitIntra)
1992
0
                        {
1993
0
                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1994
0
                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1995
0
                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
1996
0
                            if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
1997
0
                                md.bestMode = &md.pred[PRED_INTRA];
1998
0
                        }
1999
0
                        else
2000
0
                        {
2001
0
                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
2002
0
                        }
2003
0
                    }
2004
2005
                    /* finally code the best mode selected by SA8D costs:
2006
                     * RD level 2 - fully encode the best mode
2007
                     * RD level 1 - generate recon pixels
2008
                     * RD level 0 - generate chroma prediction */
2009
0
                    if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
2010
0
                    {
2011
                        /* prediction already generated for this CU, and if rd level
2012
                         * is not 0, it is already fully encoded */
2013
0
                    }
2014
0
                    else if (md.bestMode->cu.isInter(0))
2015
0
                    {
2016
0
                        uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
2017
0
                        if (m_csp != X265_CSP_I400)
2018
0
                        {
2019
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2020
0
                            {
2021
0
                                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
2022
0
                                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
2023
0
                            }
2024
0
                        }
2025
0
                        if (m_param->rdLevel == 2)
2026
0
                            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
2027
0
                        else if (m_param->rdLevel == 1)
2028
0
                        {
2029
                            /* generate recon pixels with no rate distortion considerations */
2030
0
                            CUData& cu = md.bestMode->cu;
2031
2032
0
                            uint32_t tuDepthRange[2];
2033
0
                            cu.getInterTUQtDepthRange(tuDepthRange, 0);
2034
0
                            m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
2035
0
                            residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
2036
0
                            if (cu.getQtRootCbf(0))
2037
0
                                md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
2038
0
                            else
2039
0
                            {
2040
0
                                md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
2041
0
                                if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
2042
0
                                    cu.setPredModeSubParts(MODE_SKIP);
2043
0
                            }
2044
0
                        }
2045
0
                    }
2046
0
                    else
2047
0
                    {
2048
0
                        if (m_param->rdLevel == 2)
2049
0
                            encodeIntraInInter(*md.bestMode, cuGeom);
2050
0
                        else if (m_param->rdLevel == 1)
2051
0
                        {
2052
                            /* generate recon pixels with no rate distortion considerations */
2053
0
                            CUData& cu = md.bestMode->cu;
2054
2055
0
                            uint32_t tuDepthRange[2];
2056
0
                            cu.getIntraTUQtDepthRange(tuDepthRange, 0);
2057
2058
0
                            residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
2059
0
                            if (m_csp != X265_CSP_I400)
2060
0
                            {
2061
0
                                getBestIntraModeChroma(*md.bestMode, cuGeom);
2062
0
                                residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
2063
0
                            }
2064
0
                            md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
2065
0
                        }
2066
0
                    }
2067
0
                }
2068
0
            } // !earlyskip
2069
2070
0
            if (m_bTryLossless)
2071
0
                tryLossless(cuGeom);
2072
2073
0
            if (mightSplit)
2074
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
2075
0
        }
2076
2077
0
        if (mightSplit && !skipRecursion)
2078
0
        {
2079
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
2080
0
            if (!md.bestMode)
2081
0
                md.bestMode = splitPred;
2082
0
            else if (m_param->rdLevel > 1)
2083
0
                checkBestMode(*splitPred, cuGeom.depth);
2084
0
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
2085
0
                md.bestMode = splitPred;
2086
2087
0
            checkDQPForSplitPred(*md.bestMode, cuGeom);
2088
0
        }
2089
2090
        /* determine which motion references the parent CU should search */
2091
0
        splitCUData.initSplitCUData();
2092
2093
0
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2094
0
        {
2095
0
            if (md.bestMode == &md.pred[PRED_SPLIT])
2096
0
                splitCUData.splitRefs = allSplitRefs;
2097
0
            else
2098
0
            {
2099
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2100
0
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2101
0
                uint32_t numPU = cu.getNumPartInter(0);
2102
0
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2103
0
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2104
0
            }
2105
0
        }
2106
2107
0
        if (m_param->limitModes)
2108
0
        {
2109
0
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2110
0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2111
0
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
2112
0
        }
2113
2114
0
        if (mightNotSplit && md.bestMode->cu.isSkipped(0))
2115
0
        {
2116
0
            FrameData& curEncData = *m_frame->m_encData;
2117
0
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
2118
0
            uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
2119
0
            cuStat.count[depth] += 1;
2120
0
            cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
2121
0
        }
2122
2123
        /* Copy best data to encData CTU and recon */
2124
0
        md.bestMode->cu.copyToPic(depth);
2125
0
        if (m_param->rdLevel)
2126
0
            md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
2127
2128
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2129
0
        {
2130
0
            if (mightNotSplit)
2131
0
            {
2132
0
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
2133
0
                int8_t maxTUDepth = -1;
2134
0
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
2135
0
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
2136
0
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
2137
0
            }
2138
0
        }
2139
0
    }
2140
0
    else
2141
0
    {
2142
0
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
2143
0
        {
2144
0
            qprdRefine(parentCTU, cuGeom, qp, qp);
2145
2146
0
            SplitData splitData[4];
2147
0
            splitData[0].initSplitCUData();
2148
0
            splitData[1].initSplitCUData();
2149
0
            splitData[2].initSplitCUData();
2150
0
            splitData[3].initSplitCUData();
2151
2152
0
            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2153
2154
0
            splitCUData.initSplitCUData();
2155
2156
0
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2157
0
            {
2158
0
                if (md.bestMode == &md.pred[PRED_SPLIT])
2159
0
                    splitCUData.splitRefs = allSplitRefs;
2160
0
                else
2161
0
                {
2162
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2163
0
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2164
0
                    uint32_t numPU = cu.getNumPartInter(0);
2165
0
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2166
0
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2167
0
                }
2168
0
            }
2169
2170
0
            if (m_param->limitModes)
2171
0
            {
2172
0
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2173
0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2174
0
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
2175
0
            }
2176
0
        }
2177
0
    }
2178
2179
0
    return splitCUData;
2180
0
}
2181
2182
#if ENABLE_SCC_EXT
2183
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, IBC* ibc)
2184
#else
2185
SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
2186
#endif
2187
0
{
2188
0
    if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
2189
0
        return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
2190
2191
0
    uint32_t depth = cuGeom.depth;
2192
0
    ModeDepth& md = m_modeDepth[depth];
2193
0
    md.bestMode = NULL;
2194
2195
0
    Mode* interBest = NULL; // store the best modes in inter prediction
2196
2197
0
    MV iMVCandList[4][10];
2198
0
    memset(iMVCandList, 0, sizeof(MV) * 4 * 10);
2199
2200
0
    if (m_param->searchMethod == X265_SEA)
2201
0
    {
2202
0
        int numPredDir = m_slice->isInterP() ? 1 : 2;
2203
0
        int offset = (int)(m_frame->m_reconPic[0]->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic[0]->m_buOffsetY[cuGeom.absPartIdx]);
2204
0
        for (int list = 0; list < numPredDir; list++)
2205
0
            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
2206
0
                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2207
0
                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
2208
0
    }
2209
2210
0
    SplitData splitCUData;
2211
2212
0
    bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);
2213
0
    bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
2214
0
    bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);
2215
2216
0
    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
2217
0
    {
2218
0
        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2219
0
        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2220
0
        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2221
0
        bool skipRecursion = false;
2222
0
        bool skipModes = false;
2223
0
        bool splitIntra = true;
2224
0
        bool skipRectAmp = false;
2225
0
        bool bCtuInfoCheck = false;
2226
0
        int sameContentRef = 0;
2227
2228
0
        if (m_evaluateInter)
2229
0
        {
2230
0
            if (m_refineLevel == 2)
2231
0
            {
2232
0
                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
2233
0
                    skipModes = true;
2234
0
                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
2235
0
                    skipRectAmp = true;
2236
0
            }
2237
0
            mightSplit &= false;
2238
0
        }
2239
2240
        // avoid uninitialize value in below reference
2241
0
        if (m_param->limitModes)
2242
0
        {
2243
0
            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
2244
0
            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
2245
0
            md.pred[PRED_2Nx2N].rdCost = 0;
2246
0
        }
2247
2248
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2249
0
            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
2250
2251
0
        SplitData splitData[4];
2252
0
        splitData[0].initSplitCUData();
2253
0
        splitData[1].initSplitCUData();
2254
0
        splitData[2].initSplitCUData();
2255
0
        splitData[3].initSplitCUData();
2256
0
        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2257
0
        uint32_t refMasks[2];
2258
0
        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
2259
0
        {
2260
0
            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
2261
0
                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
2262
0
            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
2263
0
            {
2264
0
                mightNotSplit &= bDecidedDepth;
2265
0
                bCtuInfoCheck = skipRecursion = false;
2266
0
                skipModes = true;
2267
0
            }
2268
0
            else if (mightNotSplit && bDecidedDepth)
2269
0
            {
2270
0
                if (m_additionalCtuInfo[cuGeom.absPartIdx])
2271
0
                {
2272
0
                    bCtuInfoCheck = skipRecursion = true;
2273
0
                    refMasks[0] = allSplitRefs;
2274
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2275
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2276
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2277
0
                    if (!sameContentRef)
2278
0
                    {
2279
0
                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
2280
0
                        {
2281
0
                            qp -= int32_t(0.04 * qp);
2282
0
                            setLambdaFromQP(parentCTU, qp);
2283
0
                        }
2284
0
                        if (m_param->bCTUInfo & 4)
2285
0
                            skipModes = false;
2286
0
                    }
2287
0
                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
2288
0
                    {
2289
0
                        if (m_param->rdLevel)
2290
0
                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
2291
0
                        if ((m_param->bCTUInfo & 4) && sameContentRef)
2292
0
                            skipModes = md.bestMode && true;
2293
0
                    }
2294
0
                }
2295
0
                else
2296
0
                {
2297
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2298
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2299
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2300
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2301
0
                    refMasks[0] = allSplitRefs;
2302
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2303
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2304
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2305
#if ENABLE_SCC_EXT
2306
                    interBest = md.bestMode;
2307
#endif
2308
0
                }
2309
0
                mightSplit &= !bDecidedDepth;
2310
0
            }
2311
0
        }
2312
0
        if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2313
0
        {
2314
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
2315
0
            {
2316
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
2317
0
                {
2318
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2319
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2320
0
                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2321
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2322
0
                    refMasks[0] = allSplitRefs;
2323
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2324
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2325
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2326
2327
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2328
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2329
#if ENABLE_SCC_EXT
2330
                    interBest = md.bestMode;
2331
#endif
2332
0
                }
2333
0
                if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
2334
0
                    skipRectAmp = true && !!md.bestMode;
2335
0
            }
2336
0
        }
2337
2338
0
        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
2339
0
        {
2340
0
            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
2341
0
            {
2342
0
                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
2343
0
                {
2344
0
                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2345
0
                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2346
0
                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2347
2348
0
                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
2349
0
                    refMasks[0] = allSplitRefs;
2350
0
                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2351
0
                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2352
0
                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2353
2354
0
                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
2355
0
                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2356
#if ENABLE_SCC_EXT
2357
                    interBest = md.bestMode;
2358
#endif
2359
0
                }
2360
0
            }
2361
0
        }
2362
        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
2363
0
        if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
2364
0
            (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
2365
0
        {
2366
0
            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
2367
0
            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
2368
0
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
2369
0
            skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
2370
0
                md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2371
0
            refMasks[0] = allSplitRefs;
2372
0
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2373
0
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
2374
0
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
2375
2376
#if ENABLE_SCC_EXT
2377
            interBest = md.bestMode;
2378
            if (m_param->bEnableSCC)
2379
            {
2380
                md.pred[PRED_MERGE_IBC].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2381
                checkRDCostIntraBCMerge2Nx2N(md.pred[PRED_MERGE_IBC], cuGeom);
2382
            }
2383
#endif
2384
2385
0
            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
2386
0
                skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
2387
0
            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
2388
0
                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
2389
0
        }
2390
0
        if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
2391
0
            skipRecursion = true;
2392
        // estimate split cost
2393
        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
2394
0
        if (mightSplit && !skipRecursion)
2395
0
        {
2396
0
            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
2397
0
                qp = int((1 / 0.96) * qp + 0.5);
2398
0
            Mode* splitPred = &md.pred[PRED_SPLIT];
2399
0
            splitPred->initCosts();
2400
0
            CUData* splitCU = &splitPred->cu;
2401
0
            splitCU->initSubCU(parentCTU, cuGeom, qp);
2402
2403
0
            uint32_t nextDepth = depth + 1;
2404
0
            ModeDepth& nd = m_modeDepth[nextDepth];
2405
0
            invalidateContexts(nextDepth);
2406
0
            Entropy* nextContext = &m_rqt[depth].cur;
2407
0
            int nextQP = qp;
2408
0
            splitIntra = false;
2409
2410
0
            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
2411
0
            {
2412
0
                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
2413
0
                if (childGeom.flags & CUGeom::PRESENT)
2414
0
                {
2415
0
                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
2416
0
                    m_rqt[nextDepth].cur.load(*nextContext);
2417
2418
0
                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
2419
0
                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
2420
2421
2422
#if ENABLE_SCC_EXT
2423
                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP, ibc);
2424
2425
                    if (nd.bestMode->cu.m_lastIntraBCMv[0].x != 0 || nd.bestMode->cu.m_lastIntraBCMv[0].y != 0)
2426
                    {
2427
                        for (int i = 0; i < 2; i++)
2428
                            ibc->m_lastIntraBCMv[i] = nd.bestMode->cu.m_lastIntraBCMv[i];
2429
                    }
2430
#else
2431
0
                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
2432
0
#endif
2433
2434
                    // Save best CU and pred data for this sub CU
2435
0
                    splitIntra |= nd.bestMode->cu.isIntra(0);
2436
0
                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
2437
0
                    splitPred->addSubCosts(*nd.bestMode);
2438
0
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
2439
0
                    nextContext = &nd.bestMode->contexts;
2440
0
                }
2441
0
                else
2442
0
                {
2443
0
                    splitCU->setEmptyPart(childGeom, subPartIdx);
2444
0
                }
2445
0
            }
2446
0
            nextContext->store(splitPred->contexts);
2447
0
            if (mightNotSplit)
2448
0
                addSplitFlagCost(*splitPred, cuGeom.depth);
2449
0
            else
2450
0
                updateModeCost(*splitPred);
2451
2452
0
            checkDQPForSplitPred(*splitPred, cuGeom);
2453
0
        }
2454
        /* If analysis mode is simple do not Evaluate other modes */
2455
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
2456
0
        {
2457
0
            if (m_slice->m_sliceType == P_SLICE)
2458
0
            {
2459
0
                if (m_checkMergeAndSkipOnly[0])
2460
0
                    skipModes = true;
2461
0
            }
2462
0
            else
2463
0
            {
2464
0
                if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])
2465
0
                    skipModes = true;
2466
0
            }
2467
0
        }
2468
        /* Split CUs
2469
         *   0  1
2470
         *   2  3 */
2471
0
        allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2472
        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
2473
0
        if (mightNotSplit)
2474
0
        {
2475
0
            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
2476
0
                setLambdaFromQP(parentCTU, qp);
2477
2478
0
            if (!skipModes)
2479
0
            {
2480
0
                refMasks[0] = allSplitRefs;
2481
2482
0
                if (m_param->limitReferences & X265_REF_LIMIT_CU)
2483
0
                {
2484
0
                    CUData& cu = md.pred[PRED_2Nx2N].cu;
2485
0
                    uint32_t refMask = cu.getBestRefIdx(0);
2486
0
                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
2487
0
                }
2488
2489
0
                if (m_slice->m_sliceType == B_SLICE)
2490
0
                {
2491
0
                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
2492
0
                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
2493
0
                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
2494
0
                    {
2495
0
                        uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
2496
0
                        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
2497
0
                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2498
0
                            {
2499
0
                                PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
2500
0
                                motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
2501
0
                            }
2502
0
                        encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
2503
0
                        checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
2504
0
                    }
2505
0
                }
2506
2507
0
                if (!skipRectAmp)
2508
0
                {
2509
0
                    if (m_param->bEnableRectInter)
2510
0
                    {
2511
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2512
0
                        uint32_t threshold_2NxN, threshold_Nx2N;
2513
2514
0
                        if (m_slice->m_sliceType == P_SLICE)
2515
0
                        {
2516
0
                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2517
0
                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2518
0
                        }
2519
0
                        else
2520
0
                        {
2521
0
                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2522
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2523
0
                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2524
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2525
0
                        }
2526
2527
0
                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
2528
0
                        if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2529
0
                        {
2530
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2531
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2532
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2533
#if ENABLE_SCC_EXT
2534
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]);
2535
                            interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest;
2536
#else
2537
0
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2538
0
#endif
2539
0
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2540
0
                        }
2541
2542
0
                        if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
2543
0
                        {
2544
0
                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
2545
0
                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
2546
0
                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2547
#if ENABLE_SCC_EXT
2548
                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks, iMVCandList[SIZE_Nx2N]);
2549
                            interBest = (md.pred[PRED_Nx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_Nx2N] : interBest;
2550
#else
2551
0
                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
2552
0
#endif
2553
0
                            checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
2554
0
                        }
2555
2556
0
                        if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
2557
0
                        {
2558
0
                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
2559
0
                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
2560
0
                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2561
#if ENABLE_SCC_EXT
2562
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks, iMVCandList[SIZE_2NxN]);
2563
                            interBest = (md.pred[PRED_2NxN].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxN] : interBest;
2564
#else
2565
0
                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
2566
0
#endif
2567
0
                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
2568
0
                        }
2569
0
                    }
2570
2571
                    // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
2572
0
                    if (m_slice->m_sps->maxAMPDepth > depth)
2573
0
                    {
2574
0
                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
2575
0
                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
2576
2577
0
                        if (m_slice->m_sliceType == P_SLICE)
2578
0
                        {
2579
0
                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
2580
0
                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
2581
2582
0
                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
2583
0
                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
2584
0
                        }
2585
0
                        else
2586
0
                        {
2587
0
                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
2588
0
                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
2589
0
                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
2590
0
                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2591
2592
0
                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
2593
0
                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
2594
0
                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
2595
0
                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
2596
0
                        }
2597
2598
0
                        bool bHor = false, bVer = false;
2599
0
                        if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
2600
0
                            bHor = true;
2601
0
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
2602
0
                            bVer = true;
2603
0
                        else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
2604
0
                        {
2605
0
                            bHor = true;
2606
0
                            bVer = true;
2607
0
                        }
2608
2609
0
                        if (bHor)
2610
0
                        {
2611
0
                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
2612
0
                            if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2613
0
                            {
2614
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
2615
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2616
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2617
0
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2618
0
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2619
#if ENABLE_SCC_EXT
2620
                                interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest;
2621
#endif
2622
0
                            }
2623
2624
0
                            if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
2625
0
                            {
2626
0
                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
2627
0
                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
2628
0
                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
2629
0
                                checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
2630
0
                                checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
2631
#if ENABLE_SCC_EXT
2632
                                interBest = (md.pred[PRED_2NxnU].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnU] : interBest;
2633
#endif
2634
0
                            }
2635
2636
0
                            if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
2637
0
                            {
2638
0
                                refMasks[0] = allSplitRefs;                                    /* 75% top */
2639
0
                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
2640
0
                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
2641
0
                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
2642
0
                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
2643
#if ENABLE_SCC_EXT
2644
                                interBest = (md.pred[PRED_2NxnD].rdCost < interBest->rdCost) ? &md.pred[PRED_2NxnD] : interBest;
2645
#endif
2646
0
                            }
2647
0
                        }
2648
2649
0
                        if (bVer)
2650
0
                        {
2651
0
                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
2652
0
                            if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2653
0
                            {
2654
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
2655
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2656
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2657
0
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2658
0
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2659
#if ENABLE_SCC_EXT
2660
                                interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest;
2661
#endif
2662
0
                            }
2663
2664
0
                            if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
2665
0
                            {
2666
0
                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
2667
0
                                refMasks[1] = allSplitRefs;                                    /* 75% right */
2668
0
                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2669
0
                                checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
2670
0
                                checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
2671
#if ENABLE_SCC_EXT
2672
                                interBest = (md.pred[PRED_nLx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nLx2N] : interBest;
2673
#endif
2674
0
                            }
2675
2676
0
                            if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
2677
0
                            {
2678
0
                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
2679
0
                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
2680
0
                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2681
0
                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
2682
0
                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
2683
#if ENABLE_SCC_EXT
2684
                                interBest = (md.pred[PRED_nRx2N].rdCost < interBest->rdCost) ? &md.pred[PRED_nRx2N] : interBest;
2685
#endif
2686
0
                            }
2687
0
                        }
2688
0
                    }
2689
0
                }
2690
2691
#if ENABLE_SCC_EXT
2692
                if (m_param->bEnableSCC)
2693
                {
2694
                    bool intraBlockCopyFastSearch = (m_param->bEnableSCC == 1) ? true : false, bUse1DSearchFor8x8 = false, bValid;
2695
                    md.pred[PRED_IBC_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2696
                    checkIntraBC_rd5_6(md.pred[PRED_IBC_2Nx2N], cuGeom, SIZE_2Nx2N, false, bUse1DSearchFor8x8, *ibc);
2697
                    checkBestMode(md.pred[PRED_IBC_2Nx2N], depth);
2698
2699
                    if (intraBlockCopyFastSearch)
2700
                    {
2701
                        if ((int)depth == m_slice->m_sps->log2DiffMaxMinCodingBlockSize)
2702
                        {
2703
                            md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2704
                            checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
2705
                            checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
2706
2707
                            md.pred[PRED_MIXED_IBC_NX2N].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2708
                            bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_NX2N], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_Nx2N, iMVCandList[SIZE_Nx2N]);
2709
                            if (bValid)
2710
                                encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_NX2N], cuGeom);
2711
                            else
2712
                                md.pred[PRED_MIXED_IBC_NX2N].rdCost = UINT64_MAX;
2713
                            checkBestMode(md.pred[PRED_MIXED_IBC_NX2N], depth);
2714
2715
                            md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2716
                            checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
2717
                            checkBestMode(md.pred[PRED_IBC_2NxN], depth);
2718
2719
                            md.pred[PRED_MIXED_IBC_2NXN].cu.initSubCU(parentCTU, cuGeom, qp, ibc->m_lastIntraBCMv);
2720
                            bValid = predMixedIntraBCInterSearch(md.pred[PRED_MIXED_IBC_2NXN], cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, SIZE_2NxN, iMVCandList[SIZE_2NxN]);
2721
                            if (bValid)
2722
                                encodeResAndCalcRdInterCU(md.pred[PRED_MIXED_IBC_2NXN], cuGeom);
2723
                            else
2724
                                md.pred[PRED_MIXED_IBC_2NXN].rdCost = UINT64_MAX;
2725
                            checkBestMode(md.pred[PRED_MIXED_IBC_2NXN], depth);
2726
                        }
2727
                    }
2728
                    else // full search
2729
                    {
2730
                        md.pred[PRED_IBC_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2731
                        checkIntraBC_rd5_6(md.pred[PRED_IBC_2NxN], cuGeom, SIZE_2NxN, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_2NxN] + 8));
2732
                        checkBestMode(md.pred[PRED_IBC_2NxN], depth);
2733
2734
                        md.pred[PRED_IBC_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
2735
                        checkIntraBC_rd5_6(md.pred[PRED_IBC_Nx2N], cuGeom, SIZE_Nx2N, false, bUse1DSearchFor8x8, *ibc, (iMVCandList[SIZE_Nx2N] + 8));
2736
                        checkBestMode(md.pred[PRED_IBC_Nx2N], depth);
2737
                    }
2738
                }
2739
#endif
2740
2741
0
                if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
2742
0
                {
2743
0
                    if (!m_param->limitReferences || splitIntra)
2744
0
                    {
2745
0
                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
2746
0
                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
2747
0
                        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
2748
0
                        checkBestMode(md.pred[PRED_INTRA], depth);
2749
2750
0
                        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
2751
0
                        {
2752
0
                            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
2753
0
                            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
2754
0
                            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
2755
0
                        }
2756
0
                    }
2757
0
                    else
2758
0
                    {
2759
0
                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
2760
0
                    }
2761
0
                }
2762
0
            }
2763
2764
#if ENABLE_SCC_EXT
2765
            // If Intra BC keep last coded Mv
2766
            if (md.bestMode->cu.isInter(0))
2767
            {
2768
                MVField mvField;
2769
                const CUData* cu = &md.bestMode->cu;
2770
                md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2771
                int iRefIdxFirst = mvField.refIdx;
2772
                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2773
                int iRefIdxLast = mvField.refIdx;
2774
                bool isIntraBCFirst = (iRefIdxFirst >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxFirst]->m_poc == cu->m_slice->m_poc : false;
2775
                bool isIntraBCLast = (iRefIdxLast >= 0) ? cu->m_slice->m_refFrameList[0][iRefIdxLast]->m_poc == cu->m_slice->m_poc : false;
2776
2777
                if (isIntraBCFirst || isIntraBCLast)
2778
                {
2779
                    if (cu->m_partSize[0] == SIZE_2Nx2N)
2780
                    {
2781
                        md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2782
                        if (mvField.mv != cu->m_lastIntraBCMv[0])
2783
                        {
2784
                            md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2785
                            md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2786
                        }
2787
                    }
2788
                    else if (cu->m_partSize[0] == SIZE_2NxN || cu->m_partSize[0] == SIZE_Nx2N)
2789
                    {
2790
                        // mixed PU, only one partition is IntraBC coded
2791
                        if (isIntraBCFirst != isIntraBCLast)
2792
                        {
2793
                            if (isIntraBCFirst)
2794
                            {
2795
                                // Part 0
2796
                                md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2797
                                if (mvField.mv != cu->m_lastIntraBCMv[0])
2798
                                {
2799
                                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2800
                                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2801
                                }
2802
                            }
2803
                            else if (isIntraBCLast)
2804
                            {
2805
                                // Part 1
2806
                                md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2807
                                if (mvField.mv != cu->m_lastIntraBCMv[0])
2808
                                {
2809
                                    md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2810
                                    md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2811
                                }
2812
                            }
2813
                        }
2814
                        else // normal IntraBC CU
2815
                        {
2816
                            // Part 0
2817
                            md.bestMode->cu.getMvField(cu, 0, 0, mvField);
2818
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2819
                            {
2820
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2821
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2822
                            }
2823
                            // Part 1
2824
                            md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 1, 0, mvField);
2825
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2826
                            {
2827
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2828
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2829
                            }
2830
                        }
2831
                    }
2832
                    else
2833
                    {
2834
                        // NxN
2835
                        for (int part = 0; part < 4; part++)
2836
                        {
2837
                            md.bestMode->cu.getMvField(cu, cu->m_numPartitions - 4 + part, 0, mvField);
2838
                            if (mvField.mv != cu->m_lastIntraBCMv[0])
2839
                            {
2840
                                md.bestMode->cu.m_lastIntraBCMv[1] = cu->m_lastIntraBCMv[0];
2841
                                md.bestMode->cu.m_lastIntraBCMv[0] = mvField.mv;
2842
                            }
2843
                        }
2844
                    }
2845
                }
2846
            } // is inter
2847
#endif
2848
2849
0
            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
2850
0
            {
2851
0
                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
2852
2853
0
                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
2854
0
                {
2855
0
                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
2856
0
                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
2857
0
                }
2858
0
                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
2859
0
            }
2860
0
            if (m_bTryLossless)
2861
0
                tryLossless(cuGeom);
2862
2863
0
            if (mightSplit)
2864
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
2865
0
        }
2866
2867
0
        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
2868
0
        {
2869
0
            if (mightNotSplit)
2870
0
            {
2871
0
                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
2872
0
                int8_t maxTUDepth = -1;
2873
0
                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
2874
0
                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
2875
0
                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
2876
0
            }
2877
0
        }
2878
2879
        /* compare split RD cost against best cost */
2880
0
        if (mightSplit && !skipRecursion)
2881
0
            checkBestMode(md.pred[PRED_SPLIT], depth);
2882
2883
0
        if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
2884
0
        {
2885
0
            int cuIdx = (cuGeom.childOffset - 1) / 3;
2886
0
            cacheCost[cuIdx] = md.bestMode->rdCost;
2887
0
        }
2888
2889
        /* determine which motion references the parent CU should search */
2890
0
        splitCUData.initSplitCUData();
2891
0
        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2892
0
        {
2893
0
            if (md.bestMode == &md.pred[PRED_SPLIT])
2894
0
                splitCUData.splitRefs = allSplitRefs;
2895
0
            else
2896
0
            {
2897
                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2898
0
                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : (m_param->bEnableSCC ? interBest->cu : md.bestMode->cu);
2899
0
                uint32_t numPU = cu.getNumPartInter(0);
2900
0
                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2901
0
                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2902
0
            }
2903
0
        }
2904
2905
0
        if (m_param->limitModes)
2906
0
        {
2907
0
            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2908
0
            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2909
0
            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2910
0
        }
2911
2912
        /* Copy best data to encData CTU and recon */
2913
0
        md.bestMode->cu.copyToPic(depth);
2914
0
        for (int i = 0; i < !!m_param->bEnableSCC + 1; i++)
2915
0
            md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[i], parentCTU.m_cuAddr, cuGeom.absPartIdx);
2916
0
    }
2917
0
    else
2918
0
    {
2919
0
        if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16)
2920
0
        {
2921
0
            qprdRefine(parentCTU, cuGeom, qp, qp);
2922
2923
0
            SplitData splitData[4];
2924
0
            splitData[0].initSplitCUData();
2925
0
            splitData[1].initSplitCUData();
2926
0
            splitData[2].initSplitCUData();
2927
0
            splitData[3].initSplitCUData();
2928
2929
0
            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
2930
2931
0
            splitCUData.initSplitCUData();
2932
0
            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
2933
0
            {
2934
0
                if (md.bestMode == &md.pred[PRED_SPLIT])
2935
0
                    splitCUData.splitRefs = allSplitRefs;
2936
0
                else
2937
0
                {
2938
                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
2939
0
                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
2940
0
                    uint32_t numPU = cu.getNumPartInter(0);
2941
0
                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
2942
0
                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
2943
0
                }
2944
0
            }
2945
2946
0
            if (m_param->limitModes)
2947
0
            {
2948
0
                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
2949
0
                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
2950
0
                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
2951
0
            }
2952
0
        }
2953
0
    }
2954
2955
0
    return splitCUData;
2956
0
}
2957
2958
void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
2959
0
{
2960
0
    uint32_t depth = cuGeom.depth;
2961
0
    ModeDepth& md = m_modeDepth[depth];
2962
0
    md.bestMode = NULL;
2963
2964
0
    m_evaluateInter = 0;
2965
0
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
2966
0
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
2967
0
    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
2968
0
    int split = 0;
2969
2970
0
    TrainingData td;
2971
0
    td.init(parentCTU, cuGeom);
2972
2973
0
    if (!m_param->bDynamicRefine)
2974
0
        m_refineLevel = m_param->interRefine;
2975
0
    else
2976
0
        m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
2977
2978
0
    if (m_param->interRefine == 1)
2979
0
        split = (m_param->scaleFactor && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && (!mightNotSplit ||
2980
0
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2981
0
    else
2982
0
        split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
2983
0
                (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
2984
0
    td.split = split;
2985
2986
0
    if ((bDecidedDepth && mightNotSplit) || (m_param->bAnalysisType == HEVC_INFO && parentCTU.m_cuDepth[cuGeom.absPartIdx] == 4))
2987
0
    {
2988
0
        setLambdaFromQP(parentCTU, qp, lqp);
2989
2990
0
        Mode& mode = md.pred[0];
2991
0
        md.bestMode = &mode;
2992
0
        mode.cu.initSubCU(parentCTU, cuGeom, qp);
2993
0
        PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
2994
0
        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
2995
0
        {
2996
0
            if (m_param->intraRefine == 4)
2997
0
                compressIntraCU(parentCTU, cuGeom, qp);
2998
0
            else
2999
0
            {
3000
0
                bool reuseModes = !((m_param->intraRefine == 3) ||
3001
0
                    (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
3002
0
                if (reuseModes)
3003
0
                {
3004
0
                    memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
3005
0
                    memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
3006
0
                }
3007
0
                checkIntra(mode, cuGeom, size);
3008
0
            }
3009
0
        }
3010
0
        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
3011
0
        {
3012
0
            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
3013
0
            uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
3014
0
            for (uint32_t part = 0; part < numPU; part++)
3015
0
            {
3016
0
                PredictionUnit pu(mode.cu, cuGeom, part);
3017
0
                if (m_param->analysisLoadReuseLevel == 10 || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel >= 7))
3018
0
                {
3019
0
                    x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
3020
0
                    int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
3021
0
                    mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
3022
0
                    mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
3023
0
                    for (int list = 0; list < m_slice->isInterB() + 1; list++)
3024
0
                    {
3025
0
                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
3026
0
                        mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
3027
0
                        mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
3028
0
                    }
3029
0
                    if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
3030
0
                    {
3031
0
                        if (m_param->interRefine == 1)
3032
0
                            m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
3033
                        //AMVP
3034
0
                        MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
3035
0
                        mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours);
3036
0
                        for (int list = 0; list < m_slice->isInterB() + 1; list++)
3037
0
                        {
3038
0
                            int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
3039
0
                            if (ref == -1)
3040
0
                                continue;
3041
0
                            MV mvp;
3042
3043
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
3044
                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc, part, pu.puAbsPartIdx);
3045
#else
3046
0
                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
3047
0
#endif
3048
0
                            mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
3049
0
                            if (m_param->interRefine == 1)
3050
0
                            {
3051
0
                                MV outmv, mvpSelect[3];
3052
0
                                mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
3053
0
                                if (m_param->mvRefine > 1)
3054
0
                                {
3055
0
                                    mvpSelect[1] = mvp;
3056
0
                                    if(m_param->mvRefine > 2)
3057
0
                                        mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
3058
0
                                }
3059
0
                                searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
3060
0
                                mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
3061
0
                            }
3062
0
                            mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
3063
0
                        }
3064
0
                    }
3065
0
                    else
3066
0
                    {
3067
0
                        MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
3068
0
                        uint8_t candDir[MRG_MAX_NUM_CANDS];
3069
0
                        mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
3070
0
                        uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
3071
0
                        if (mode.cu.isBipredRestriction())
3072
0
                        {
3073
                            /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
3074
0
                            if (candDir[mvpIdx] == 3)
3075
0
                            {
3076
0
                                candDir[mvpIdx] = 1;
3077
0
                                candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
3078
0
                            }
3079
0
                        }
3080
0
                        mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
3081
0
                        mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
3082
0
                        mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
3083
0
                        mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part);
3084
0
                        mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part);
3085
0
                    }
3086
0
                }
3087
0
                motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
3088
0
            }
3089
0
            if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
3090
0
                encodeResAndCalcRdSkipCU(mode);
3091
0
            else
3092
0
                encodeResAndCalcRdInterCU(mode, cuGeom);
3093
3094
            /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
3095
0
            bool mergeInter2Nx2N = size == SIZE_2Nx2N && mode.cu.m_mergeFlag[0];
3096
0
            if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
3097
0
                checkDQP(mode, cuGeom);
3098
0
        }
3099
3100
0
        if (m_refineLevel < 2)
3101
0
        {
3102
0
            if (m_bTryLossless)
3103
0
                tryLossless(cuGeom);
3104
3105
0
            if (mightSplit)
3106
0
                addSplitFlagCost(*md.bestMode, cuGeom.depth);
3107
3108
0
            if (mightSplit && m_param->rdLevel < 5)
3109
0
                checkDQPForSplitPred(*md.bestMode, cuGeom);
3110
0
        }
3111
3112
0
        if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7)
3113
0
        {
3114
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
3115
0
            {
3116
0
                m_modeFlag[list] = true;
3117
0
                if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
3118
0
                    m_checkMergeAndSkipOnly[list] = true;
3119
0
            }
3120
0
            m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
3121
0
            for (int list = 0; list < m_slice->isInterB() + 1; list++)
3122
0
            {
3123
0
                m_modeFlag[list] = false;
3124
0
                m_checkMergeAndSkipOnly[list] = false;
3125
0
            }
3126
0
        }
3127
3128
0
        if (m_param->bDynamicRefine)
3129
0
            classifyCU(parentCTU,cuGeom, *md.bestMode, td);
3130
3131
0
        if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
3132
0
        {
3133
0
            if ((m_slice->m_origSliceType != I_SLICE))
3134
0
            {
3135
0
                if (parentCTU.m_cuDepth[cuGeom.absPartIdx] < 4 && mightNotSplit)
3136
0
                    m_evaluateInter = 1;
3137
0
                else
3138
0
                    bDecidedDepth = true;
3139
0
                m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
3140
0
                m_evaluateInter = 0;
3141
0
            }
3142
0
            else
3143
0
            {
3144
0
                compressIntraCU(parentCTU, cuGeom, qp);
3145
0
            }
3146
0
        }
3147
0
    }
3148
0
    if (!bDecidedDepth || split)
3149
0
    {
3150
0
        Mode* splitPred = &md.pred[PRED_SPLIT];
3151
0
        if (!split)
3152
0
            md.bestMode = splitPred;
3153
0
        splitPred->initCosts();
3154
0
        CUData* splitCU = &splitPred->cu;
3155
0
        splitCU->initSubCU(parentCTU, cuGeom, qp);
3156
3157
0
        uint32_t nextDepth = depth + 1;
3158
0
        ModeDepth& nd = m_modeDepth[nextDepth];
3159
0
        invalidateContexts(nextDepth);
3160
0
        Entropy* nextContext = &m_rqt[depth].cur;
3161
0
        int nextQP = qp;
3162
3163
0
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
3164
0
        {
3165
0
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
3166
0
            if (childGeom.flags & CUGeom::PRESENT)
3167
0
            {
3168
0
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
3169
0
                m_rqt[nextDepth].cur.load(*nextContext);
3170
3171
0
                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
3172
0
                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
3173
3174
0
                int lamdaQP = (m_param->analysisLoadReuseLevel >= 7) ? nextQP : lqp;
3175
3176
0
                if (split)
3177
0
                    m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
3178
0
                else
3179
0
                    qprdRefine(parentCTU, childGeom, nextQP, lamdaQP);
3180
3181
                // Save best CU and pred data for this sub CU
3182
0
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
3183
0
                splitPred->addSubCosts(*nd.bestMode);
3184
0
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
3185
0
                nextContext = &nd.bestMode->contexts;
3186
0
            }
3187
0
            else
3188
0
            {
3189
0
                splitCU->setEmptyPart(childGeom, subPartIdx);
3190
                // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
3191
0
                memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
3192
0
            }
3193
0
        }
3194
0
        nextContext->store(splitPred->contexts);
3195
0
        if (mightNotSplit)
3196
0
            addSplitFlagCost(*splitPred, cuGeom.depth);
3197
0
        else
3198
0
            updateModeCost(*splitPred);
3199
3200
0
        if (m_refineLevel)
3201
0
        {
3202
0
            if (m_param->rdLevel > 1)
3203
0
                checkBestMode(*splitPred, cuGeom.depth);
3204
0
            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
3205
0
                md.bestMode = splitPred;
3206
0
        }
3207
3208
0
        checkDQPForSplitPred(*splitPred, cuGeom);
3209
3210
        /* Copy best data to encData CTU and recon */
3211
0
        md.bestMode->cu.copyToPic(depth);
3212
0
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic[0], parentCTU.m_cuAddr, cuGeom.absPartIdx);
3213
0
    }
3214
0
    if (m_param->bDynamicRefine && bDecidedDepth)
3215
0
        trainCU(parentCTU, cuGeom, *md.bestMode, td);
3216
0
}
3217
3218
/* Classify the current CU into a refinement level (1..X265_REFINE_INTER_LEVELS)
 * for dynamic refinement (--dynamic-refine). Uses a naive-Bayes style decision
 * over two predictors gathered from previously trained frames: CU variance and
 * CU RD cost. Side effects: writes trainData.cuVariance and m_refineLevel. */
void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    /* variance of this CU is one of the two classification predictors */
    trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
    /* only classify once the frame has accumulated training statistics */
    if (m_frame->m_classifyFrame)
    {
        uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
        uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
        float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
        uint8_t varRefineLevel = 1;
        uint8_t rdRefineLevel = 1;
        uint64_t cuCost = bestMode.rdCost;
        /* per-depth stride into the flattened [depth][level] statistics arrays */
        int offset = (depth * X265_REFINE_INTER_LEVELS);
        /* cheaper than the trained level-1 RD mean: trivially level 1 */
        if (cuCost < m_frame->m_classifyRd[offset])
            m_refineLevel = 1;
        else
        {
            /* total sample count across all refine levels at this depth,
             * used as the normalizer for the prior probabilities */
            uint64_t trainingCount = 0;
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                trainingCount += m_frame->m_classifyCount[offset];
            }
            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
            {
                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
                /* Calculate distance values: |predictor - trained class mean|
                 * for both the variance and RD-cost predictors */
                diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
                diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));

                /* Calculate prior probability - ranges between 0 and 1 */
                if (trainingCount)
                    probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);

                /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
                P(c|x) is the posterior probability of class given predictor.
                P(c) is the prior probability of class.
                P(x|c) is the likelihood which is the probability of predictor given class.
                P(x) is the prior probability of predictor.
                Comparison below is the cross-multiplied form of
                diffRefine[i]/probRefine[i] < diffRefine[cur]/probRefine[cur],
                i.e. class i beats the current class when its distance-to-prior
                ratio is smaller. */
                /* NOTE(review): indexes probRefine/diffRefine with m_refineLevel - 1;
                 * assumes m_refineLevel >= 1 on entry — confirm callers never leave it 0 */
                int curRefineLevel = m_refineLevel - 1;
                if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
                    varRefineLevel = i + 1;
                if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
                    rdRefineLevel = i + 1;
            }
            /* take the more aggressive (higher) of the two per-predictor votes */
            m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
        }
    }
}
3267
3268
/* Accumulate per-CU training statistics (RD cost, variance, sample count) into
 * the CTU's collection arrays for dynamic refinement. Before the frame is
 * marked as a classify frame the class label is derived from how the encoded
 * CU compares against the saved analysis data; afterwards the label is the
 * classification already computed (m_refineLevel).
 * Note: ctu is const& but the m_collectCU* members are mutated — they are
 * pointer members, so only the pointed-to storage changes. */
void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
{
    uint32_t depth = cuGeom.depth;
    int classify = 1;
    if (!m_frame->m_classifyFrame)
    {
        /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
                          and CUs that has split.
           classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
           classify = 3 : CUs encoded as any other mode. */

        /* saved analysis decision (pred mode, part size, merge flag) matches
         * the final encoded decision for this CU */
        bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
            trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
            trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
        /* CU at the maximum split depth that was decided as split */
        bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
        if (refineInter0 || refineInter1)
            classify = 1;
        else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
            classify = 2;
        else
            classify = 3;
    }
    else
        /* training after classification: label with the level chosen by classifyCU() */
        classify = m_refineLevel;
    uint64_t cuCost = bestMode.rdCost;
    /* flattened [depth][classify-1] index into the per-CTU collection arrays */
    int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
    ctu.m_collectCURd[offset] += cuCost;
    ctu.m_collectCUVariance[offset] += trainData.cuVariance;
    ctu.m_collectCUCount[offset]++;
}
3298
3299
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
3300
/* Evaluate 2Nx2N merge/skip candidates at rd-levels 0-4 using SA8D cost.
 * Picks the best merge candidate by SA8D, then (if rdLevel > 0) runs full RD
 * on both the skip (no residual) and merge-with-residual variants of that
 * candidate and keeps the cheaper. Sets md.bestMode on success; leaves it
 * untouched when no valid candidate survives the filters. */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    /* both candidates start as 2Nx2N inter with the merge flag raised */
    tempPred->initCosts();
    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->initCosts();
    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
    PredictionUnit pu(merge.cu, cuGeom, 0);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int sizeIdx = cuGeom.log2CUSize - 2;
    /* intra-refresh bound: only initialized (and only read) when
     * bIntraRefresh is set on a P slice */
    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
    }
    for (uint32_t i = 0; i < numMergeCand; ++i)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: First row in slice can't negative
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y < m_sliceMinY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond bound since it is another slice area
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y > m_sliceMaxY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y > m_sliceMaxY)
                    continue;
            }

            /* frame-parallel: vertical MV must not reach rows the reference
             * frame has not finished reconstructing yet */
            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond safe reference area
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        /* chroma MC is done here only when chroma contributes to the SA8D metric */
        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

        /* cost = luma (+ optional chroma) SA8D plus the bits to signal candidate i */
        tempPred->sa8dBits = getTUBits(i, numMergeCand);
        tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            /* swap roles: the loser becomes scratch for the next candidate */
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);

    if (m_param->rdLevel)
    {
        /* lossless cannot skip residual: force the skip path to lose the
         * rdCost comparison below */
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64;
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual: clone the winning candidate into tempPred */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->sa8dBits = bestPred->sa8dBits;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        /* keep whichever of skip vs merge-with-residual is cheaper in RD */
        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
    md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
    md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
    md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
    md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
    checkDQP(*md.bestMode, cuGeom);
}
3437
3438
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
3439
/* Evaluate 2Nx2N merge/skip candidates at rd-levels 5-6 with full RD cost.
 * For each surviving candidate this tries merge-with-residual (until a CBF==0
 * merge is found, after which residual coding is pointless to repeat) and
 * skip (no residual, unless lossless). Sets m_modeDepth[depth].bestMode only
 * when some candidate produced a finite RD cost. */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    /* both candidates start as 2Nx2N inter with the merge flag raised */
    merge.initCosts();
    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.initCosts();
    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t candDir[MRG_MAX_NUM_CANDS];
    uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
#if ENABLE_SCC_EXT
    /* screen-content coding: demote bi-pred candidates that are not allowed */
    restrictBipredMergeCand(&merge.cu, 0, candMvField, candDir, numMergeCand);
#endif

    PredictionUnit pu(merge.cu, cuGeom, 0);

    /* once a merge candidate codes with zero CBF, further residual encodes
     * cannot do better than that skip — stop trying them */
    bool foundCbf0Merge = false;
    /* the candidate list is padded with zero MVs; code each distinct
     * zero-MV pattern (P and B) at most once */
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;

    /* intra-refresh bound: only initialized (and only read) when
     * bIntraRefresh is set on a P slice */
    int safeX, maxSafeMv;
    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
    {
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
    }
    for (uint32_t i = 0; i < numMergeCand; i++)
    {
        if (m_bFrameParallel)
        {
            // Parallel slices bound check
            if (m_param->maxSlices > 1)
            {
                // NOTE: First row in slice can't negative
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y < m_sliceMinY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y < m_sliceMinY)
                    continue;

                // Last row in slice can't reference beyond bound since it is another slice area
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
                if (candMvField[i][0].refIdx >= 0 && candMvField[i][0].mv.notZero() && candMvField[i][0].mv.y > m_sliceMaxY)
                    continue;

                if (candMvField[i][1].refIdx >= 0 && candMvField[i][1].mv.notZero() && candMvField[i][1].mv.y > m_sliceMaxY)
                    continue;
            }

            /* frame-parallel: vertical MV must not reach rows the reference
             * frame has not finished reconstructing yet */
            if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)
                continue;
        }

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
        if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (candDir[i] == 3 &&
            !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
            !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }
        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
            tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
            candMvField[i][0].mv.x > maxSafeMv)
            // skip merge candidates which reference beyond safe reference area
            continue;
#if ENABLE_SCC_EXT
        /* SCC: a candidate whose L0 reference is the current picture (intra
         * block copy) is not evaluated by this regular merge path */
        if ((candDir[i] == 1 || candDir[i] == 3) && (m_slice->m_refPOCList[0][candMvField[i][0].refIdx] == m_slice->m_poc))
        {
            continue;
        }
#endif
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = candDir[i];
        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */

        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */

            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */

            if (swapped)
            {
                /* tempPred was just demoted to scratch; rebuild candidate i's
                 * MV state in it and reuse the prediction already computed */
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = candDir[i];
                tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
                tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
                tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
                tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
                tempPred->cu.setPredModeSubParts(MODE_INTER);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    /* finite cost means at least one candidate was fully evaluated */
    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
        bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
        checkDQP(*bestPred, cuGeom);
    }
}
3594
3595
#if ENABLE_SCC_EXT
3596
/* (SCC extension) Evaluate 2Nx2N merge candidates whose L0 reference is the
 * current picture, i.e. intra block copy merge. Each valid candidate is
 * motion-compensated and RD-coded; the cheapest result ends up in mergeIBC,
 * which is then submitted to checkBestMode/checkDQP.
 * NOTE(review): `cu` is a by-value copy of mergeIBC.cu and `tempPred` a
 * by-value copy of the PRED_MERGE_IBC Mode — this appears to rely on
 * CUData/Mode holding pointers to shared storage so mutations through the
 * copies take effect; confirm against the CUData/Mode copy semantics. */
void Analysis::checkRDCostIntraBCMerge2Nx2N(Mode& mergeIBC, const CUGeom& cuGeom)
{
    mergeIBC.initCosts();
    MVField  cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t numValidMergeCand = 0;
    CUData cu = mergeIBC.cu;
    PredictionUnit pu(mergeIBC.cu, cuGeom, 0);
    mergeIBC.rdCost = MAX_INT64;
    for (uint32_t ui = 0; ui < m_slice->m_maxNumMergeCand; ++ui)
    {
        interDirNeighbours[ui] = 0;
    }
    int8_t org_qp;
    /* pixel position and size of this CU, used for block-vector validation */
    int xPos = cu.m_cuPelX;
    int yPos = cu.m_cuPelY;
    int width = 1 << cu.m_log2CUSize[0];
    int height = 1 << cu.m_log2CUSize[0];
    uint8_t depth = cu.m_cuDepth[0];
    mergeIBC.cu.setPartSizeSubParts(SIZE_2Nx2N);
    Mode tempPred = m_modeDepth[depth].pred[PRED_MERGE_IBC];

    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, interDirNeighbours);
    /* IBC block vectors are integer-pel: round candidate MVs accordingly */
    cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
    restrictBipredMergeCand(&cu, 0, cMvFieldNeighbours, interDirNeighbours, numValidMergeCand);

    for (uint8_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
    {
        /* IBC merge requires uni-directional L0 prediction */
        if (interDirNeighbours[mergeCand] != 1)
        {
            continue;
        }

        /* only candidates referencing the current picture are IBC */
        if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mergeCand][0].refIdx] != m_slice->m_poc)
        {
            continue;
        }

        /* block vector (in integer pel, hence >> 2) must point into the
         * already-reconstructed area of the current picture */
        if (!isBlockVectorValid(xPos, yPos, width, height, &cu,
            0, 0, (cMvFieldNeighbours[mergeCand][0].mv.x >> 2), (cMvFieldNeighbours[mergeCand][0].mv.y >> 2), m_param->maxCUSize))
        {
            continue;
        }

        // set MC parameters
        cu.setPredModeSubParts(MODE_INTER);
        cu.setPartSizeSubParts(SIZE_2Nx2N);
        cu.m_mergeFlag[0] = true;
        cu.m_mvpIdx[0][0] = mergeCand;
        cu.setPUInterDir(interDirNeighbours[mergeCand], 0, 0);
        cu.setPUMv(0, cMvFieldNeighbours[mergeCand][0].mv, 0, 0);
        cu.setPUMv(1, cMvFieldNeighbours[mergeCand][1].mv, 0, 0);
        cu.setPURefIdx(0, (int8_t)cMvFieldNeighbours[mergeCand][0].refIdx, 0, 0);
        cu.setPURefIdx(1, (int8_t)cMvFieldNeighbours[mergeCand][1].refIdx, 0, 0);
        motionCompensation(cu, pu, mergeIBC.predYuv, true, m_csp != X265_CSP_I400);

        /* save QP so the residual encode's DQP side effect can be undone
         * before evaluating the next candidate */
        org_qp = cu.m_qp[0];
        encodeResAndCalcRdInterCU(mergeIBC, cuGeom);
        /* keep the running best in tempPred; mergeIBC becomes scratch */
        if (mergeIBC.rdCost < tempPred.rdCost)
            std::swap(mergeIBC, tempPred);
        cu.setQPSubParts(org_qp, 0, depth);
    }
    /* move the winner back into mergeIBC for the caller */
    std::swap(tempPred, mergeIBC);
    checkBestMode(mergeIBC, depth);
    checkDQP(mergeIBC, cuGeom);
}
3662
#endif
3663
3664
/* Evaluate a 2Nx2N/NxN/rect inter partition at rd-levels 0-4: run motion
 * search, then score the prediction with sa8d (no full RDO / quant at these
 * levels).  Results are stored in interMode (distortion, sa8dBits, sa8dCost).
 * refMask limits which reference pictures the motion search may use. */
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* analysis-load: seed each PU's reference indices from the stats saved by a
     * prior encode (reuse level 10 carries full data elsewhere, so skip it here) */
    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    /* multi-pass refine: seed ref/MV/MVP-idx per direction from first-pass stats */
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                /* m_reuseRef is laid out as one full-frame plane per direction */
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }
    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);

    /* predInterSearch sets interMode.sa8dBits */
    const Yuv& fencYuv = *interMode.fencYuv;
    Yuv& predYuv = interMode.predYuv;
    int part = partitionFromLog2Size(cuGeom.log2CUSize);
    interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
    /* chroma distortion is only measured when chroma sa8d is enabled and the
     * stream actually carries chroma */
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
    }
    interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);

    /* analysis-save: record the chosen reference indices for later reuse
     * (mirror layout of the load path above) */
    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}
3728
3729
/* Evaluate an inter partition at rd-levels 5-6: run motion search, then do a
 * full residual encode (transform/quant/entropy) to obtain a true RD cost via
 * encodeResAndCalcRdInterCU().  With the SCC extension enabled, iMVCandList
 * receives additional intra-block-copy MV candidates for the search. */
#if ENABLE_SCC_EXT
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2], MV* iMVCandList)
#else
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
#endif
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* analysis-load: seed each PU's reference indices from a prior encode's
     * saved stats (reuse level 10 carries its data elsewhere, so skip it) */
    if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                bestME[i].ref = m_reuseRef[refOffset + index++];
        }
    }

    /* multi-pass refine: seed ref/MV/MVP-idx per direction from first-pass stats */
    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
    {
        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t part = 0; part < numPU; part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                /* m_reuseRef is laid out as one full-frame plane per direction */
                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
                bestME[i].ref = ref[cuGeom.absPartIdx];
                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
            }
        }
    }

#if ENABLE_SCC_EXT
    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask, iMVCandList);
#else
    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
#endif

    /* predInterSearch sets interMode.sa8dBits, but this is ignored */
    encodeResAndCalcRdInterCU(interMode, cuGeom);

    /* analysis-save: record the chosen reference indices for later reuse
     * (mirror layout of the load path above) */
    if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU)
    {
        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
        int index = 0;

        uint32_t numPU = interMode.cu.getNumPartInter(0);
        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
                m_reuseRef[refOffset + index++] = bestME[i].ref;
        }
    }
}
3793
3794
#if ENABLE_SCC_EXT
3795
void Analysis::checkIntraBC_rd5_6(Mode& intraBCMode, const CUGeom& cuGeom, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc, MV* iMVCandList)
3796
{
3797
    intraBCMode.initCosts();
3798
    intraBCMode.cu.setPartSizeSubParts(ePartSize);
3799
    intraBCMode.cu.setPredModeSubParts(MODE_INTER);
3800
    intraBCMode.cu.setLumaIntraDirSubParts(DC_IDX, 0, cuGeom.depth);
3801
    intraBCMode.cu.setChromIntraDirSubParts(DC_IDX, 0, cuGeom.depth);
3802
    for (int i = 0; i < 2; i++)
3803
        intraBCMode.cu.m_lastIntraBCMv[i] = ibc.m_lastIntraBCMv[i];
3804
3805
    bool bValid = predIntraBCSearch(intraBCMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, ePartSize, testOnlyPred, bUse1DSearchFor8x8, ibc);
3806
    if (bValid)
3807
        encodeResAndCalcRdInterCU(intraBCMode, cuGeom);
3808
    else
3809
        intraBCMode.rdCost = UINT64_MAX;
3810
3811
    if (bValid && (intraBCMode.cu.m_log2CUSize[0] <= 4) && (intraBCMode.cu.m_partSize[0] == SIZE_2NxN || intraBCMode.cu.m_partSize[0] == SIZE_Nx2N))
3812
    {
3813
        int dummyWidth, dummyHeight;
3814
        uint32_t partAddr = 0;
3815
        intraBCMode.cu.getPartIndexAndSize(1, partAddr, dummyWidth, dummyHeight);
3816
        iMVCandList[0] = intraBCMode.cu.m_mv[0][0];
3817
        iMVCandList[1] = intraBCMode.cu.m_mv[0][partAddr];
3818
    }
3819
}
3820
#endif
3821
3822
/* Build and cost a bi-directional 2Nx2N candidate from the best uni-dir L0 and
 * L1 results already found by inter2Nx2N.  Two estimates are made: (1) the
 * average of the two best uni-dir predictions, and (2) optionally the
 * coincident (zero-MV) blocks.  The cheaper estimate (by sa8d cost) is left
 * in bidir2Nx2N; no full RDO is done here. */
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

    /* bail out if bi-prediction is restricted for this CU size, or either
     * uni-dir search failed (cost == MAX_UINT) */
#if ENABLE_SCC_EXT
    if ((cu.is8x8BipredRestriction(inter2Nx2N.bestME[0][0].mv, inter2Nx2N.bestME[0][1].mv, inter2Nx2N.bestME[0][0].ref, inter2Nx2N.bestME[0][1].ref) ? (1 << cu.m_log2CUSize[0] == 8) : cu.isBipredRestriction()) || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#else
    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
#endif
    {
        /* mark the mode as unusable for all later cost comparisons */
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV   mvzero(0, 0);
    int  partEnum = cuGeom.log2CUSize - 2;

    /* adopt the uni-dir winners as the starting point for bidir */
    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0    = bestME[0].ref;
    MV  mvp0    = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1    = bestME[1].ref;
    MV  mvp1    = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);          /* 3 == use both L0 and L1 */
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    PredictionUnit pu(cu, cuGeom, 0);
    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));

    int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
        /* Add in chroma distortion */
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    /* bidir signalling cost: both uni-dir bit costs, swapping the two
     * single-list selection bits for the bidir selection bits */
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    /* only worth trying the zero-MV estimate when at least one MV is non-zero */
    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;  // convert to quarter-pel units
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* full MC with zero MVs (clobbers cu.m_mv; restored below if the
             * zero estimate loses) */
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            motionCompensation(cu, pu, tmpPredYuv, true, true);
            zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);

        }
        else
        {
            /* luma-only fast path: average the two co-located reference blocks
             * directly instead of running full motion compensation */
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
            primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }
        /* re-cost the MV bits with zero MVs substituted */
        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        mvp0 = checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvpIdx0, bits0, zcost);
        mvp1 = checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            /* zero-MV bidir wins: commit zero MVs and refined MVPs */
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            /* NOTE(review): this checks m_bChromaSa8d alone, while the MC above
             * additionally required a non-I400 csp — presumably equivalent in
             * practice because tmpPredYuv luma is filled either way; confirm
             * the chroma planes copied here are valid for I400 inputs. */
            if (m_bChromaSa8d) /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
        }
        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
3959
3960
/* rd-level 0/1 second pass: after CTU analysis chose the partitioning, recurse
 * to the chosen leaf CUs, perform transform/quant on the residual, and write
 * reconstructed pixels into the frame's recon picture.  The prediction pixels
 * were accumulated into the depth-0 predYuv during analysis. */
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    /* descend to the depth the analysis pass selected for this region */
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.absPartIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom, m_csp);

    PicYuv& reconPic = *m_frame->m_reconPic[0];

    /* make sure the depth-local fenc copy covers this sub-CU */
    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough
        
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        /* luma transform/quant + recon; chroma only when the stream has chroma */
        residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            getBestIntraModeChroma(*bestMode, cuGeom);
            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
        }
    }
    else // if (cu.isInter(0))
    {
        ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough

        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);

        primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
                                      fencYuv.m_buf[0], predY,
                                      fencYuv.m_size, predYuv.m_size);

        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            pixel* predU = predYuv.getCbAddr(absPartIdx);
            pixel* predV = predYuv.getCrAddr(absPartIdx);
            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);
        }

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, 0, tuDepthRange);

        /* a 2Nx2N merge with no coded coefficients is signalled as SKIP */
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        /* the add_ps/copy_pp index selects an aligned fast path when every
         * buffer pointer and stride is 64-byte aligned */
        if (cu.m_cbf[0][0])
        {
            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
            bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
            primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
                (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        }
        else
            /* no luma residual: recon is the prediction itself */
            primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                           predY, predYuv.m_size);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
             pixel* predU = predYuv.getCbAddr(absPartIdx);
             pixel* predV = predYuv.getCrAddr(absPartIdx);
             if (cu.m_cbf[1][0])
             {
                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                 bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                     (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
             }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predU, predYuv.m_csize);

            if (cu.m_cbf[2][0])
            {
                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
                bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
                    (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
            }
            else
                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predV, predYuv.m_csize);
        }
    }

    cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
}
4085
4086
/* Account for the CU split flag in the mode's cost.  At rd-level >= 3 the
 * flag is entropy-coded through the mode's CABAC contexts; at rd-level <= 1
 * a single bit is added to the sa8d estimate; otherwise a single bit is
 * added to the total bit count. */
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    int rdLevel = m_param->rdLevel;

    if (rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        mode.totalBits += mode.contexts.getNumberOfWrittenBits();
        updateModeCost(mode);
        return;
    }

    if (rdLevel <= 1)
    {
        /* sa8d-based levels: approximate the flag as one extra bit */
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
}
4108
4109
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
4110
0
{
4111
    /* Do not attempt to code a block larger than the largest block in the
4112
     * co-located CTUs in L0 and L1 */
4113
0
    int currentQP = parentCTU.m_qp[0];
4114
0
    int previousQP = currentQP;
4115
0
    uint32_t minDepth0 = 4, minDepth1 = 4;
4116
0
    uint32_t sum = 0;
4117
0
    int numRefs = 0;
4118
0
    int refPresent = (!m_slice->m_param->bEnableSCC && m_slice->m_numRefIdx[0]) || ((!m_slice->m_param->bEnableSCC && (m_slice->m_numRefIdx[0] - 1)));
4119
0
    if (refPresent)
4120
0
    {
4121
0
        numRefs++;
4122
0
        const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
4123
0
        previousQP = cu.m_qp[0];
4124
0
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
4125
0
            return 0;
4126
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
4127
0
        {
4128
0
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
4129
0
            minDepth0 = X265_MIN(d, minDepth0);
4130
0
            sum += d;
4131
0
        }
4132
0
    }
4133
0
    if (m_slice->m_numRefIdx[1])
4134
0
    {
4135
0
        numRefs++;
4136
0
        const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
4137
0
        if (!cu.m_cuDepth[cuGeom.absPartIdx])
4138
0
            return 0;
4139
0
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
4140
0
        {
4141
0
            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
4142
0
            minDepth1 = X265_MIN(d, minDepth1);
4143
0
            sum += d;
4144
0
        }
4145
0
    }
4146
0
    if (!numRefs)
4147
0
        return 0;
4148
4149
0
    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
4150
0
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
4151
4152
    /* allow block size growth if QP is raising or avg depth is
4153
     * less than 1.5 of min depth */
4154
0
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
4155
0
        minDepth -= 1;
4156
4157
0
    return minDepth;
4158
0
}
4159
4160
/* returns true if recursion should be stopped */
4161
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
4162
0
{
4163
    /* early exit when the RD cost of best mode at depth n is less than the sum
4164
     * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
4165
     * left, colocated) and avg cost of that CU at depth "n" with weightage for
4166
     * each quantity */
4167
4168
0
    uint32_t depth = cuGeom.depth;
4169
0
    FrameData& curEncData = *m_frame->m_encData;
4170
0
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
4171
0
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
4172
0
    uint64_t cuCount = cuStat.count[depth];
4173
4174
0
    uint64_t neighCost = 0, neighCount = 0;
4175
0
    const CUData* above = parentCTU.m_cuAbove;
4176
0
    if (above)
4177
0
    {
4178
0
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
4179
0
        neighCost += astat.avgCost[depth] * astat.count[depth];
4180
0
        neighCount += astat.count[depth];
4181
4182
0
        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
4183
0
        if (aboveLeft)
4184
0
        {
4185
0
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
4186
0
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
4187
0
            neighCount += lstat.count[depth];
4188
0
        }
4189
4190
0
        const CUData* aboveRight = parentCTU.m_cuAboveRight;
4191
0
        if (aboveRight)
4192
0
        {
4193
0
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
4194
0
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
4195
0
            neighCount += rstat.count[depth];
4196
0
        }
4197
0
    }
4198
0
    const CUData* left = parentCTU.m_cuLeft;
4199
0
    if (left)
4200
0
    {
4201
0
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
4202
0
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
4203
0
        neighCount += nstat.count[depth];
4204
0
    }
4205
4206
    // give 60% weight to all CU's and 40% weight to neighbour CU's
4207
0
    if (neighCount + cuCount)
4208
0
    {
4209
0
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
4210
0
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
4211
0
        if (curCost < avgCost && avgCost)
4212
0
            return true;
4213
0
    }
4214
4215
0
    return false;
4216
0
}
4217
4218
bool Analysis::complexityCheckCU(const Mode& bestMode)
4219
0
{
4220
0
    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
4221
0
    {
4222
0
        uint32_t mean = 0;
4223
0
        uint32_t homo = 0;
4224
0
        uint32_t cuSize = bestMode.fencYuv->m_size;
4225
0
        for (uint32_t y = 0; y < cuSize; y++) {
4226
0
            for (uint32_t x = 0; x < cuSize; x++) {
4227
0
                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
4228
0
            }
4229
0
        }
4230
0
        mean = mean / (cuSize * cuSize);
4231
0
        for (uint32_t y = 0; y < cuSize; y++) {
4232
0
            for (uint32_t x = 0; x < cuSize; x++) {
4233
0
                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
4234
0
            }
4235
0
        }
4236
0
        homo = homo / (cuSize * cuSize);
4237
4238
0
        if (homo < (.1 * mean))
4239
0
            return true;
4240
4241
0
        return false;
4242
0
    }
4243
0
    else
4244
0
    {
4245
0
        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;
4246
0
        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;
4247
0
        intptr_t stride = m_frame->m_fencPic->m_stride;
4248
0
        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;
4249
0
        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);
4250
0
        uint32_t sum = (uint32_t)sum_ss;
4251
0
        uint32_t ss = (uint32_t)(sum_ss >> 32);
4252
0
        uint32_t pixelCount = 1 << shift;
4253
0
        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;
4254
4255
0
        if (cuEdgeVariance > (double)m_param->edgeVarThreshold)
4256
0
            return false;
4257
0
        else
4258
0
            return true;
4259
0
    }
4260
0
 }
4261
4262
uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
4263
0
{
4264
0
    uint32_t cuVariance = 0;
4265
0
    uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
4266
0
    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
4267
4268
0
    uint32_t width = m_frame->m_fencPic->m_picWidth;
4269
0
    uint32_t height = m_frame->m_fencPic->m_picHeight;
4270
0
    uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
4271
0
    uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
4272
0
    uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
4273
0
    uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
4274
0
    uint32_t cnt = 0; 
4275
4276
0
    for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
4277
0
    {
4278
0
        for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
4279
0
        {
4280
0
            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
4281
0
            cuVariance += blockVariance[idx];
4282
0
            cnt++;
4283
0
        }
4284
0
    }
4285
0
    return cuVariance / cnt;
4286
0
}
4287
4288
double Analysis::aqQPOffset(const CUData& ctu, const CUGeom& cuGeom)
4289
0
{
4290
0
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
4291
0
    PicQPAdaptationLayer* pQPLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
4292
4293
0
    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pQPLayer->aqPartWidth;
4294
0
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pQPLayer->aqPartHeight;
4295
4296
0
    uint32_t aqStride = pQPLayer->numAQPartInWidth;
4297
4298
0
    double dQpOffset = pQPLayer->dQpOffset[aqPosY * aqStride + aqPosX];
4299
0
    return dQpOffset;
4300
0
}
4301
4302
double Analysis::cuTreeQPOffset(const CUData& ctu, const CUGeom& cuGeom)
4303
0
{
4304
0
    uint32_t aqDepth = X265_MIN(cuGeom.depth, m_frame->m_lowres.maxAQDepth - 1);
4305
0
    PicQPAdaptationLayer* pcAQLayer = &m_frame->m_lowres.pAQLayer[aqDepth];
4306
4307
0
    uint32_t aqPosX = (ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]) / pcAQLayer->aqPartWidth;
4308
0
    uint32_t aqPosY = (ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]) / pcAQLayer->aqPartHeight;
4309
4310
0
    uint32_t aqStride = pcAQLayer->numAQPartInWidth;
4311
4312
0
    double dQpOffset = pcAQLayer->dCuTreeOffset[aqPosY * aqStride + aqPosX];
4313
0
    return dQpOffset;
4314
0
}
4315
4316
int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
4317
23.5k
{
4318
23.5k
    FrameData& curEncData = *m_frame->m_encData;
4319
23.5k
    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
4320
23.5k
    bool bCuTreeOffset = IS_REFERENCED(m_frame) && m_param->rc.cuTree && !complexCheck;
4321
4322
23.5k
    if ((m_param->analysisMultiPassDistortion && m_param->rc.bStatRead) || (m_param->ctuDistortionRefine && strlen(m_param->analysisLoad)))
4323
0
    {
4324
0
        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
4325
0
        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
4326
0
            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
4327
0
            qp += distortionData->offset[ctu.m_cuAddr];
4328
0
    }
4329
4330
23.5k
    if (m_param->analysisLoadReuseLevel >= 2 && m_param->rc.cuTree)
4331
0
    {
4332
0
        int cuIdx = (ctu.m_cuAddr * ctu.m_numPartitions) + cuGeom.absPartIdx;
4333
0
        if (ctu.m_slice->m_sliceType == I_SLICE)
4334
0
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_intra_data*)m_frame->m_analysisData.intraData)->cuQPOff[cuIdx]));
4335
0
        else
4336
0
            return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int32_t)(qp + 0.5 + ((x265_analysis_inter_data*)m_frame->m_analysisData.interData)->cuQPOff[cuIdx]));
4337
0
    }
4338
23.5k
    if (m_param->rc.hevcAq)
4339
0
    {
4340
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
4341
0
        double dQpOffset = 0;
4342
0
        if (bCuTreeOffset)
4343
0
        {
4344
0
            dQpOffset = cuTreeQPOffset(ctu, cuGeom);
4345
0
        }
4346
0
        else
4347
0
        {
4348
0
            dQpOffset = aqQPOffset(ctu, cuGeom);
4349
0
            if (complexCheck)
4350
0
            {
4351
0
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
4352
0
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
4353
0
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
4354
0
                return (offset < max_threshold);
4355
0
            }
4356
0
        }
4357
0
        qp += dQpOffset;
4358
0
    }
4359
23.5k
    else
4360
23.5k
    {
4361
23.5k
        int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
4362
        /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
4363
23.5k
        double *qpoffs = bCuTreeOffset ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
4364
23.5k
        if (qpoffs)
4365
23.5k
        {
4366
23.5k
            uint32_t width = m_frame->m_fencPic->m_picWidth;
4367
23.5k
            uint32_t height = m_frame->m_fencPic->m_picHeight;
4368
23.5k
            uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
4369
23.5k
            uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
4370
23.5k
            uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
4371
23.5k
            uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
4372
23.5k
            double dQpOffset = 0;
4373
23.5k
            uint32_t cnt = 0;
4374
68.6k
            for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
4375
45.1k
            {
4376
146k
                for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
4377
101k
                {
4378
101k
                    uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
4379
101k
                    dQpOffset += qpoffs[idx];
4380
101k
                    cnt++;
4381
101k
                }
4382
45.1k
            }
4383
23.5k
            dQpOffset /= cnt;
4384
23.5k
            qp += dQpOffset;
4385
23.5k
            if (complexCheck)
4386
0
            {
4387
0
                int32_t offset = (int32_t)(dQpOffset * 100 + .5);
4388
0
                double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
4389
0
                int32_t max_threshold = (int32_t)(threshold * 100 + .5);
4390
0
                return (offset < max_threshold);
4391
0
            }
4392
23.5k
        }
4393
23.5k
    }
4394
4395
23.5k
    return x265_clip3(m_param->rc.qpMin, m_param->rc.qpMax, (int)(qp + 0.5));
4396
23.5k
}
4397
4398
void Analysis::normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype)
4399
0
{
4400
0
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
4401
0
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
4402
0
    int shift = (X265_DEPTH - 8);
4403
4404
0
    double s = 1 + 0.005 * qp;
4405
4406
    // Calculate denominator of normalization factor
4407
0
    uint64_t fDc_den = 0, fAc_den = 0;
4408
4409
    // 1. Calculate dc component
4410
0
    uint64_t z_o = 0;
4411
0
    for (uint32_t block_yy = 0; block_yy < blockSize; block_yy += 4)
4412
0
    {
4413
0
        for (uint32_t block_xx = 0; block_xx < blockSize; block_xx += 4)
4414
0
        {
4415
0
            uint32_t temp = src[block_yy * blockSize + block_xx] >> shift;
4416
0
            z_o += temp * temp; // 2 * (Z(0)) pow(2)
4417
0
        }
4418
0
    }
4419
0
    fDc_den = (2 * z_o)  + (blockSize * blockSize * ssim_c1); // 2 * (Z(0)) pow(2) + N * C1
4420
0
    fDc_den /= ((blockSize >> 2) * (blockSize >> 2));
4421
4422
    // 2. Calculate ac component
4423
0
    uint64_t z_k = 0;
4424
0
    int block = (int)(((log(blockSize) / log(2)) - 2) + 0.5);
4425
0
    primitives.cu[block].normFact(src, blockSize, shift, &z_k);
4426
4427
    // Remove the DC part
4428
0
    z_k -= z_o;
4429
4430
0
    fAc_den = z_k + int(s * z_k) + ssim_c2;
4431
0
    fAc_den /= ((blockSize >> 2) * (blockSize >> 2));
4432
4433
0
    ctu.m_fAc_den[ttype] = fAc_den;
4434
0
    ctu.m_fDc_den[ttype] = fDc_den;
4435
0
}
4436
4437
/* Compute normalization factors for all planes of this CTU: always luma,
 * plus both chroma planes unless the stream or source is monochrome. */
void Analysis::calculateNormFactor(CUData& ctu, int qp)
{
    const pixel* lumaSrc = m_modeDepth[0].fencYuv.m_buf[0];
    uint32_t lumaSize = m_modeDepth[0].fencYuv.m_size;

    normFactor(lumaSrc, lumaSize, ctu, qp, TEXT_LUMA);

    /* Skip chroma when either the encode or the source picture is 4:0:0 */
    if (m_csp == X265_CSP_I400 || m_frame->m_fencPic->m_picCsp == X265_CSP_I400)
        return;

    uint32_t chromaSize = m_modeDepth[0].fencYuv.m_csize;
    normFactor(m_modeDepth[0].fencYuv.m_buf[1], chromaSize, ctu, qp, TEXT_CHROMA_U);
    normFactor(m_modeDepth[0].fencYuv.m_buf[2], chromaSize, ctu, qp, TEXT_CHROMA_V);
}
4454
4455
int Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom)
4456
0
{
4457
0
    int sameContentRef = 0;
4458
0
    int m_curPoc = parentCTU.m_slice->m_poc;
4459
0
    int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx];
4460
0
    int numPredDir = m_slice->isInterP() ? 1 : 2;
4461
0
    for (int list = 0; list < numPredDir; list++)
4462
0
    {
4463
0
        for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
4464
0
        {
4465
0
            int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc;
4466
#if ENABLE_SCC_EXT
4467
            if (refPoc == m_curPoc)
4468
                continue;
4469
#endif
4470
0
            int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx];
4471
0
            if ((refPoc < prevChange && refPoc < m_curPoc) || (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE)))
4472
0
                sameContentRef++;    /* Content changed */
4473
0
        }
4474
0
    }
4475
0
    return sameContentRef;
4476
0
}