Coverage Report

Created: 2026-03-08 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/search.cpp
Line
Count
Source
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Steve Borho <steve@borho.org>
5
*          Min Chen <chenm003@163.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 2 of the License, or
10
* (at your option) any later version.
11
*
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
* GNU General Public License for more details.
16
*
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
*
21
* This program is also available under a commercial proprietary license.
22
* For more information, contact us at license @ x265.com.
23
*****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "picyuv.h"
28
#include "cudata.h"
29
30
#include "search.h"
31
#include "entropy.h"
32
#include "rdcost.h"
33
34
#include "analysis.h"  // TLD
35
#include "framedata.h"
36
#include "encoder.h"
37
38
using namespace X265_NS;
39
40
#if _MSC_VER
41
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
42
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
43
#pragma warning(disable: 4127) // conditional expression is constant
44
#endif
45
46
0
#define MVP_IDX_BITS 1
47
48
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
49
50
Search::Search()
51
21.2k
{
52
21.2k
    memset(m_rqt, 0, sizeof(m_rqt));
53
54
85.1k
    for (int i = 0; i < 3; i++)
55
63.8k
    {
56
63.8k
        m_qtTempTransformSkipFlag[i] = NULL;
57
63.8k
        m_qtTempCbf[i] = NULL;
58
63.8k
    }
59
60
21.2k
    m_numLayers = 0;
61
21.2k
    m_intraPred = NULL;
62
21.2k
    m_intraPredAngs = NULL;
63
21.2k
    m_fencScaled = NULL;
64
21.2k
    m_fencTransposed = NULL;
65
21.2k
    m_tsCoeff = NULL;
66
21.2k
    m_tsResidual = NULL;
67
21.2k
    m_tsRecon = NULL;
68
21.2k
    m_param = NULL;
69
21.2k
    m_slice = NULL;
70
21.2k
    m_frame = NULL;
71
21.2k
    m_maxTUDepth = -1;
72
21.2k
}
73
74
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
75
21.2k
{
76
21.2k
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
77
21.2k
    m_param = &param;
78
21.2k
    m_bFrameParallel = param.frameNumThreads > 1;
79
21.2k
    m_numLayers = g_log2Size[param.maxCUSize] - 2;
80
#if ENABLE_SCC_EXT
81
    m_ibcEnabled = param.bEnableSCC;
82
#endif
83
84
21.2k
    m_rdCost.setPsyRdScale(param.psyRd);
85
21.2k
    m_rdCost.setSsimRd(param.bSsimRd);
86
21.2k
    m_me.init(param.internalCsp);
87
88
21.2k
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
89
21.2k
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
90
0
        ok &= m_quant.allocNoiseReduction(param);
91
92
21.2k
    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
93
94
    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
95
     * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
96
21.2k
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
97
98
21.2k
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
99
21.2k
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
100
21.2k
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
101
102
21.2k
    m_limitTU = 0;
103
21.2k
    if (m_param->limitTU)
104
0
    {
105
0
        if (m_param->limitTU == 1)
106
0
            m_limitTU = X265_TU_LIMIT_BFS;
107
0
        else if (m_param->limitTU == 2)
108
0
            m_limitTU = X265_TU_LIMIT_DFS;
109
0
        else if (m_param->limitTU == 3)
110
0
            m_limitTU = X265_TU_LIMIT_NEIGH;
111
0
        else if (m_param->limitTU == 4)
112
0
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
113
0
    }
114
115
    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
116
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
117
     * which are reconstructed at each depth are valid. At the end, the transform depth table
118
     * is walked and the coeff and recon at the correct depths are collected */
119
120
21.2k
    if (param.internalCsp != X265_CSP_I400)
121
21.2k
    {
122
115k
        for (uint32_t i = 0; i <= m_numLayers; i++)
123
94.0k
        {
124
94.0k
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
125
94.0k
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
126
94.0k
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
127
94.0k
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
128
94.0k
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
129
94.0k
        }
130
21.2k
    }
131
0
    else
132
0
    {
133
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
134
0
        {
135
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
136
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
137
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
138
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
139
0
        }
140
0
    }
141
142
    /* the rest of these buffers are indexed per-depth */
143
94.0k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
144
72.7k
    {
145
72.7k
        int cuSize = param.maxCUSize >> i;
146
72.7k
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
147
72.7k
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
148
72.7k
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
149
72.7k
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
150
72.7k
    }
151
152
21.2k
    if (param.internalCsp != X265_CSP_I400)
153
21.2k
    {
154
21.2k
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
155
21.2k
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
156
21.2k
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
157
21.2k
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
158
21.2k
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
159
21.2k
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
160
21.2k
    }
161
0
    else
162
0
    {
163
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
164
0
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
165
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
166
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
167
0
    }
168
169
21.2k
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
170
21.2k
    m_fencScaled = m_intraPred + 32 * 32;
171
21.2k
    m_fencTransposed = m_fencScaled + 32 * 32;
172
21.2k
    m_intraPredAngs = m_fencTransposed + 32 * 32;
173
174
21.2k
    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
175
21.2k
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
176
21.2k
    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
177
178
#if ENABLE_SCC_EXT
179
    m_numBVs = 0;
180
    m_numBV16s = 0;
181
#endif
182
183
21.2k
    return ok;
184
185
0
fail:
186
0
    return false;
187
21.2k
}
188
189
Search::~Search()
190
21.2k
{
191
115k
    for (uint32_t i = 0; i <= m_numLayers; i++)
192
94.0k
    {
193
94.0k
        X265_FREE(m_rqt[i].coeffRQT[0]);
194
94.0k
        m_rqt[i].reconQtYuv.destroy();
195
94.0k
        m_rqt[i].resiQtYuv.destroy();
196
94.0k
    }
197
198
94.0k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
199
72.7k
    {
200
72.7k
        m_rqt[i].tmpResiYuv.destroy();
201
72.7k
        m_rqt[i].tmpPredYuv.destroy();
202
72.7k
        m_rqt[i].bidirPredYuv[0].destroy();
203
72.7k
        m_rqt[i].bidirPredYuv[1].destroy();
204
72.7k
    }
205
206
21.2k
    X265_FREE(m_qtTempCbf[0]);
207
21.2k
    X265_FREE(m_qtTempTransformSkipFlag[0]);
208
21.2k
    X265_FREE(m_intraPred);
209
21.2k
    X265_FREE(m_tsCoeff);
210
21.2k
    X265_FREE(m_tsResidual);
211
21.2k
    X265_FREE(m_tsRecon);
212
21.2k
}
213
214
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
215
26.4k
{
216
26.4k
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
217
218
26.4k
    m_me.setQP(qp);
219
26.4k
    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
220
221
26.4k
    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
222
26.4k
    m_quant.setQPforQuant(ctu, quantQP);
223
26.4k
    return quantQP;
224
26.4k
}
225
226
void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData& cu, PicYuv* fencPic, int puOffset, PartSize part, int areaIdx, int finalIdx, bool isMVP , const int* neighborIdx)
227
0
{
228
#ifdef DETAILED_CU_STATS
229
    m_stats[cu.m_encData->m_frameEncoderID].countMotionEstimate++;
230
#endif
231
232
0
    int satdCost = 0;
233
0
    int numPredDir = slice->isInterP() ? 1 : 2;
234
0
    int searchRange = isMVP ? 32 : m_param->searchRange;
235
236
0
    MV mvp(0,0);
237
0
    MV mvzero(0,0);
238
239
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
240
0
    MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
241
242
0
    MotionData bestME[2];
243
0
    bestME[0].cost = MAX_UINT;
244
0
    bestME[1].cost = MAX_UINT;
245
246
0
    int numPart = cu.getNumPartInter(0);
247
0
    uint32_t lastMode = 0;
248
249
0
    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
250
0
    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
251
252
0
    int numMvc = 0;
253
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
254
0
    {
255
0
        PredictionUnit pu(cu, cuGeom, puIdx);
256
257
0
        int pos = finalIdx + puIdx * puOffset;
258
0
        int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
259
260
0
        InterNeighbourMV neighbours[6];
261
0
        if(!isMVP)
262
0
           cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, neighbours);
263
264
0
        for (int list = 0; list < numPredDir; list++)
265
0
        {
266
0
            int numIdx = slice->m_numRefIdx[list];
267
0
            for (int ref = 0; ref < numIdx; ref++)
268
0
            {
269
0
                getBlkBits(part, slice->isInterP(), puIdx, lastMode, m_listSelBits);
270
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
271
0
                bits += getTUBits(ref, numIdx);
272
273
0
                MV mvmin, mvmax, outmv,mvp_lowres;;
274
0
                mvp = !isMVP ? m_areaBestMV[areaIdx][list][ref] : mvp;
275
276
0
                MV zeroMV[2] = {0,0};
277
0
                const MV* amvp = zeroMV;
278
0
                int mvpIdx = 0;
279
280
0
                bool bLowresMVP = false;
281
0
                if (!isMVP)
282
0
                {
283
0
                    for(int dir = MD_LEFT; dir <= MD_ABOVE_LEFT ; dir++)
284
0
                    {
285
0
                        int neighIdx = neighborIdx[dir];
286
0
                        if (neighIdx >= 0)
287
0
                        {
288
0
                            MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
289
0
                            for (int i = 0; i < 2; i++)
290
0
                            {
291
0
                                neighbours[dir].mv[i] = neighborData.mv[i];
292
0
                                neighbours[dir].refIdx[i] = neighborData.ref[i];
293
0
                            }
294
0
                            neighbours[dir].isAvailable = (neighborData.ref[0] >= 0 || neighborData.ref[1] >= 0);
295
0
                        }
296
0
                        else
297
0
                        {
298
0
                            for (int i = 0; i < 2; i++)
299
0
                                neighbours[dir].refIdx[i] = -1;
300
0
                            neighbours[dir].isAvailable = false;
301
0
                        }
302
0
                    }
303
304
0
                    numMvc = cu.getPMV(neighbours, list, ref, amvpCand[list][ref], mvc);
305
0
                    if (numMvc > 0)
306
0
                    {
307
0
                        amvp = amvpCand[list][ref];
308
0
                        mvpIdx = selectMVP(cu, pu, amvp, list, ref);
309
0
                        mvp = amvp[mvpIdx];                 
310
0
                    }
311
0
                    else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
312
0
                    {
313
0
                        MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
314
315
0
                        bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
316
0
                        bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
317
0
                        bool uniL1 = (meData.ref[1] >= 0 && meData.ref[0] == REF_NOT_VALID);
318
319
0
                        if (uniL0)
320
0
                            mvp = meData.mv[0];
321
0
                        else if (uniL1)
322
0
                            mvp = meData.mv[1];
323
0
                        else if (bi)
324
0
                            mvp = meData.mv[list];
325
0
                    }
326
0
                }
327
328
0
                m_me.setMVP(mvp);
329
330
0
                if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
331
0
                {
332
0
                    uint32_t blockX = cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + (pu.width  >> 1);
333
0
                    uint32_t blockY = cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + (pu.height >> 1);
334
335
0
                    if (blockX < m_slice->m_sps->picWidthInLumaSamples && blockY < m_slice->m_sps->picHeightInLumaSamples)
336
0
                    {
337
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
338
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
339
0
                        if (lmv.notZero() && !layer)
340
0
                        {
341
0
                            mvc[numMvc++] = lmv;
342
0
                            bLowresMVP = true;
343
0
                        }
344
0
                        mvp_lowres = lmv;
345
0
                    }
346
0
                }
347
348
0
                PicYuv* recon = slice->m_mref[list][ref].reconPic;
349
0
                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0);
350
351
0
                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
352
0
                setSearchRange(cu, mvp, searchRange, mvmin, mvmax);
353
354
0
                if (isMVP)
355
0
                {
356
0
                    satdCost = m_me.diamondSearch(&slice->m_mref[list][ref], mvmin, mvmax, outmv);
357
0
                    m_areaBestMV[areaIdx][list][ref] = outmv;
358
0
                }
359
0
                else
360
0
                {
361
0
                    m_vertRestriction = slice->m_refPOCList[list][ref] == slice->m_poc;
362
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
363
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
364
365
0
                    if (bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp)
366
0
                    {
367
0
                        MV outmv_lowres;
368
0
                        bLowresMVP = false;
369
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
370
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref],  mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange,outmv_lowres, m_param->maxSlices,
371
0
                            m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0): 0);
372
373
0
                        if (lowresMvCost < satdCost)
374
0
                        {
375
0
                            outmv = outmv_lowres;
376
0
                            satdCost = lowresMvCost;
377
0
                            bLowresMVP = true;
378
0
                        }
379
0
                    }
380
0
                }
381
382
0
                bits += m_me.bitcost(outmv);
383
0
                uint32_t mvCost = m_me.mvcost(outmv);
384
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
385
386
0
                if(!isMVP)
387
0
                {
388
0
                    if (bLowresMVP)
389
0
                        updateMVP(mvp, outmv, bits, cost, mvp_lowres);
390
391
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
392
0
                }
393
0
                if (cost < bestME[list].cost)
394
0
                {
395
0
                    bestME[list].mv = outmv;
396
0
                    bestME[list].mvp = mvp;
397
0
                    bestME[list].mvpIdx = 0;
398
0
                    bestME[list].cost = cost;
399
0
                    bestME[list].bits = bits;
400
0
                    bestME[list].mvCost = mvCost;
401
0
                    bestME[list].ref = ref;
402
0
                }
403
0
            }
404
0
        }
405
406
0
        if (isMVP)
407
0
            return;
408
409
        //Bi-Direction
410
0
        MotionData bidir[2];
411
0
        uint32_t bidirCost = MAX_UINT;
412
0
        int bidirBits = 0;
413
0
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
414
415
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&
416
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !isMVP)
417
0
        {
418
0
            bidir[0] = bestME[0];
419
0
            bidir[1] = bestME[1];
420
421
0
            if (m_me.bChromaSATD)
422
0
            {
423
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
424
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
425
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
426
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
427
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
428
429
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
430
0
                    m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
431
0
            }
432
0
            else
433
0
            {
434
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
435
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
436
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
437
438
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
439
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
440
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
441
0
                    bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
442
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
443
0
            }
444
445
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
446
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
447
448
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
449
0
            if (bTryZero)
450
0
            {
451
0
                MV mvmin, mvmax;
452
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
453
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
454
0
                mvmax.y += 2;
455
0
                mvmin <<= 2;
456
0
                mvmax <<= 2;
457
458
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
459
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
460
0
            }
461
0
            if (bTryZero)
462
0
            {
463
0
                if (m_me.bChromaSATD)
464
0
                {
465
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
466
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
467
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
468
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
469
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
470
471
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
472
0
                        m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
473
0
                }
474
0
                else
475
0
                {
476
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
477
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
478
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
479
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
480
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
481
0
                }
482
483
0
                MV mvp0 = bestME[0].mvp;
484
0
                int mvpIdx0 = bestME[0].mvpIdx;
485
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
486
487
0
                MV mvp1 = bestME[1].mvp;
488
0
                int mvpIdx1 = bestME[1].mvpIdx;
489
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
490
491
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
492
493
0
                if (cost < bidirCost)
494
0
                {
495
0
                    bidir[0].mv = mvzero;
496
0
                    bidir[1].mv = mvzero;
497
0
                    bidir[0].mvp = mvp0;
498
0
                    bidir[1].mvp = mvp1;
499
0
                    bidir[0].mvpIdx = mvpIdx0;
500
0
                    bidir[1].mvpIdx = mvpIdx1;
501
0
                    bidirCost = cost;
502
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
503
0
                }
504
0
            }
505
0
        }
506
0
        MEData& outME = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
507
508
0
        outME.ref[0] = REF_NOT_VALID;
509
0
        outME.ref[1] = REF_NOT_VALID;
510
511
0
        if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
512
0
        {
513
0
            lastMode = 2;
514
515
0
            outME.mv[0] = bidir[0].mv;
516
0
            outME.mv[1] = bidir[1].mv;
517
0
            outME.mvp[0] = bidir[0].mvp;
518
0
            outME.mvp[1] = bidir[1].mvp;
519
0
            outME.mvCost[0] = bestME[0].mvCost;
520
0
            outME.mvCost[1] = bestME[1].mvCost;
521
0
            outME.ref[0] = bestME[0].ref;
522
0
            outME.ref[1] = bestME[1].ref;
523
524
0
            outME.bits = bidirBits;
525
0
            outME.cost = bidirCost;
526
0
        }
527
0
        else if (bestME[0].cost <= bestME[1].cost)
528
0
        {
529
0
            lastMode = 0;
530
531
0
            outME.mv[0] = bestME[0].mv;
532
0
            outME.mvp[0] = bestME[0].mvp;
533
0
            outME.mvCost[0] = bestME[0].mvCost;
534
0
            outME.cost = bestME[0].cost;
535
0
            outME.bits = bestME[0].bits;
536
0
            outME.ref[0] = bestME[0].ref;
537
0
            outME.ref[1] = REF_NOT_VALID;
538
0
        }
539
0
        else
540
0
        {
541
0
            lastMode = 1;
542
543
0
            outME.mv[1] = bestME[1].mv;
544
0
            outME.mvp[1] = bestME[1].mvp;
545
0
            outME.mvCost[1] = bestME[1].mvCost;
546
0
            outME.cost = bestME[1].cost;
547
0
            outME.bits = bestME[1].bits;
548
0
            outME.ref[1] = bestME[1].ref;
549
0
            outME.ref[0] = REF_NOT_VALID;
550
0
        }
551
0
    }
552
0
}
553
554
#if CHECKED_BUILD || _DEBUG
555
void Search::invalidateContexts(int fromDepth)
556
{
557
    /* catch reads without previous writes */
558
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
559
    {
560
        m_rqt[d].cur.markInvalid();
561
        m_rqt[d].rqtTemp.markInvalid();
562
        m_rqt[d].rqtRoot.markInvalid();
563
        m_rqt[d].rqtTest.markInvalid();
564
    }
565
}
566
#else
567
102k
void Search::invalidateContexts(int) {}
568
#endif
569
570
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
571
9.38M
{
572
9.38M
    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
573
9.38M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
574
575
9.38M
    if (!(log2TrSize - m_hChromaShift < 2))
576
3.41M
    {
577
3.41M
        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
578
3.41M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
579
3.41M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
580
3.41M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
581
3.41M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
582
3.41M
    }
583
584
9.38M
    if (subdiv)
585
1.49M
    {
586
1.49M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
587
7.46M
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
588
5.97M
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
589
1.49M
    }
590
9.38M
}
591
592
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
593
6.86M
{
594
6.86M
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
595
6.80M
        return;
596
597
61.8k
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
598
599
61.8k
    if (tuDepth < cu.m_tuDepth[absPartIdx])
600
11.3k
    {
601
11.3k
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
602
56.8k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
603
45.4k
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
604
605
11.3k
        return;
606
11.3k
    }
607
608
50.4k
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
609
610
50.4k
    if (log2TrSizeC < 2)
611
33.0k
    {
612
33.0k
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
613
33.0k
        if (absPartIdx & 3)
614
24.7k
            return;
615
8.26k
        log2TrSizeC = 2;
616
8.26k
    }
617
618
25.6k
    uint32_t qtLayer = log2TrSize - 2;
619
620
25.6k
    if (m_csp != X265_CSP_I422)
621
25.5k
    {
622
25.5k
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
623
25.5k
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
624
25.5k
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
625
25.5k
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
626
25.5k
    }
627
129
    else
628
129
    {
629
129
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
630
129
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
631
129
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
632
129
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
633
129
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
634
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
635
129
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
636
0
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
637
129
    }
638
25.6k
}
639
640
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
641
4.69M
{
642
4.69M
    CUData& cu = mode.cu;
643
4.69M
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
644
4.69M
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
645
4.69M
    uint32_t qtLayer    = log2TrSize - 2;
646
4.69M
    uint32_t sizeIdx    = log2TrSize - 2;
647
4.69M
    bool mightNotSplit  = log2TrSize <= depthRange[1];
648
4.69M
    bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
649
4.69M
    bool bEnableRDOQ  = !!m_param->rdoqLevel;
650
651
    /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
652
4.69M
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
653
0
    {
654
0
        mightNotSplit = false;
655
0
        mightSplit = true;
656
0
    }
657
658
4.69M
    Cost fullCost;
659
4.69M
    uint32_t bCBF = 0;
660
661
4.69M
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
662
4.69M
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
663
664
4.69M
    if (mightNotSplit)
665
4.68M
    {
666
4.68M
        if (mightSplit)
667
384k
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
668
669
4.68M
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
670
4.68M
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
671
4.68M
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
672
4.68M
        uint32_t stride   = mode.fencYuv->m_size;
673
674
        // init availability pattern
675
4.68M
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
676
4.68M
        IntraNeighbors intraNeighbors;
677
4.68M
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
678
4.68M
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
679
680
        // get prediction signal
681
4.68M
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
682
683
4.68M
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
684
4.68M
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
685
686
4.68M
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
687
4.68M
        coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
688
689
        // store original entropy coding status
690
4.68M
        if (bEnableRDOQ)
691
4.70M
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
692
4.68M
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
693
694
4.68M
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
695
4.68M
        if (numSig)
696
23.8k
        {
697
23.8k
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
698
23.8k
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
699
23.8k
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
700
23.8k
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
701
23.8k
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
702
23.8k
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
703
23.8k
        }
704
4.66M
        else
705
            // no coded residual, recon = pred
706
4.66M
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
707
708
4.68M
        bCBF = !!numSig << tuDepth;
709
4.68M
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
710
4.68M
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
711
712
4.68M
        m_entropyCoder.resetBits();
713
4.68M
        if (!absPartIdx)
714
1.76M
        {
715
1.76M
            if (!cu.m_slice->isIntra())
716
0
            {
717
0
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
718
0
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
719
0
                m_entropyCoder.codeSkipFlag(cu, 0);
720
0
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
721
0
            }
722
723
1.76M
            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
724
1.76M
        }
725
4.68M
        if (cu.m_partSize[0] == SIZE_2Nx2N)
726
2.31M
        {
727
2.31M
            if (!absPartIdx)
728
1.16M
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
729
2.31M
        }
730
2.37M
        else
731
2.37M
        {
732
2.37M
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
733
2.37M
            if (!tuDepth)
734
0
            {
735
0
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
736
0
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
737
0
            }
738
2.37M
            else if (!(absPartIdx & (qNumParts - 1)))
739
2.39M
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
740
2.37M
        }
741
4.68M
        if (log2TrSize != depthRange[0])
742
781k
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
743
744
4.68M
        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
745
746
4.68M
        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
747
23.8k
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
748
749
4.68M
        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
750
751
4.68M
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
752
0
            fullCost.bits *= 4;
753
754
4.68M
        if (m_rdCost.m_psyRd)
755
4.68M
        {
756
4.68M
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
757
4.68M
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
758
4.68M
        }
759
5.12k
        else if(m_rdCost.m_ssimRd)
760
0
        {
761
0
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
762
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
763
0
        }
764
5.12k
        else
765
5.12k
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
766
4.68M
    }
767
6.45k
    else
768
6.45k
        fullCost.rdcost = MAX_INT64;
769
770
4.69M
    if (mightSplit)
771
385k
    {
772
385k
        if (mightNotSplit)
773
385k
        {
774
385k
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
775
385k
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
776
385k
        }
777
778
        /* code split block */
779
385k
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
780
781
385k
        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
782
385k
        if (m_param->bEnableTSkipFast)
783
0
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
784
785
385k
        Cost splitCost;
786
385k
        uint32_t cbf = 0;
787
1.92M
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
788
1.53M
        {
789
1.53M
            if (checkTransformSkip)
790
0
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
791
1.53M
            else
792
1.53M
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
793
794
1.53M
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
795
1.53M
        }
796
385k
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
797
798
385k
        if (mightNotSplit && log2TrSize != depthRange[0])
799
385k
        {
800
            /* If we could have coded this TU depth, include cost of subdiv flag */
801
385k
            m_entropyCoder.resetBits();
802
385k
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
803
385k
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
804
805
385k
            if (m_rdCost.m_psyRd)
806
385k
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
807
9
            else if(m_rdCost.m_ssimRd)
808
0
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
809
9
            else
810
9
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
811
385k
        }
812
813
385k
        if (splitCost.rdcost < fullCost.rdcost)
814
475
        {
815
475
            outCost.rdcost     += splitCost.rdcost;
816
475
            outCost.distortion += splitCost.distortion;
817
475
            outCost.bits       += splitCost.bits;
818
475
            outCost.energy     += splitCost.energy;
819
475
            return;
820
475
        }
821
384k
        else
822
384k
        {
823
            // recover entropy state of full-size TU encode
824
384k
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
825
826
            // recover transform index and Cbf values
827
384k
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
828
384k
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
829
384k
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
830
384k
        }
831
385k
    }
832
833
    // set reconstruction for next intra prediction blocks if full TU prediction won
834
4.69M
    PicYuv*  reconPic = m_frame->m_reconPic[0];
835
4.69M
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
836
4.69M
    intptr_t picStride = reconPic->m_stride;
837
4.69M
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
838
839
4.69M
    outCost.rdcost     += fullCost.rdcost;
840
4.69M
    outCost.distortion += fullCost.distortion;
841
4.69M
    outCost.bits       += fullCost.bits;
842
4.69M
    outCost.energy     += fullCost.energy;
843
4.69M
}
844
845
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
846
0
{
847
0
    uint32_t fullDepth = cuGeom.depth + tuDepth;
848
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
849
0
    uint32_t tuSize = 1 << log2TrSize;
850
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
851
852
0
    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
853
854
0
    CUData& cu = mode.cu;
855
0
    Yuv* predYuv = &mode.predYuv;
856
0
    const Yuv* fencYuv = mode.fencYuv;
857
858
0
    Cost fullCost;
859
0
    fullCost.rdcost = MAX_INT64;
860
0
    int      bTSkip = 0;
861
0
    uint32_t bCBF = 0;
862
863
0
    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
864
0
    pixel*   pred = predYuv->getLumaAddr(absPartIdx);
865
0
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
866
0
    uint32_t stride = fencYuv->m_size;
867
0
    uint32_t sizeIdx = log2TrSize - 2;
868
869
    // init availability pattern
870
0
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
871
0
    IntraNeighbors intraNeighbors;
872
0
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
873
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
874
875
    // get prediction signal
876
0
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
877
878
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
879
880
0
    uint32_t qtLayer = log2TrSize - 2;
881
0
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
882
0
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
883
0
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
884
0
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
885
886
    // store original entropy coding status
887
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
888
889
0
    if (bEnableRDOQ)
890
0
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
891
892
0
    int checkTransformSkip = 1;
893
0
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
894
0
    {
895
0
        uint64_t tmpCost;
896
0
        uint32_t tmpEnergy = 0;
897
898
0
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
899
0
        pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
900
0
        bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
901
0
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
902
903
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
904
905
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
906
0
        if (numSig)
907
0
        {
908
0
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
909
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
910
0
            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
911
0
            bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
912
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
913
0
        }
914
0
        else if (useTSkip)
915
0
        {
916
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
917
0
            checkTransformSkip = 0;
918
0
            break;
919
0
        }
920
0
        else
921
            // no residual coded, recon = pred
922
0
            primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
923
924
0
        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
925
926
0
        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
927
0
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
928
929
0
        if (useTSkip)
930
0
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
931
932
0
        m_entropyCoder.resetBits();
933
0
        if (!absPartIdx)
934
0
        {
935
0
            if (!cu.m_slice->isIntra())
936
0
            {
937
0
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
938
0
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
939
0
                m_entropyCoder.codeSkipFlag(cu, 0);
940
0
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
941
0
            }
942
943
0
            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
944
0
        }
945
0
        if (cu.m_partSize[0] == SIZE_2Nx2N)
946
0
        {
947
0
            if (!absPartIdx)
948
0
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
949
0
        }
950
0
        else
951
0
        {
952
0
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
953
0
            if (!tuDepth)
954
0
            {
955
0
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
956
0
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
957
0
            }
958
0
            else if (!(absPartIdx & (qNumParts - 1)))
959
0
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
960
0
        }
961
0
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
962
963
0
        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
964
965
0
        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
966
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
967
968
0
        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
969
970
0
        if (!useTSkip)
971
0
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
972
973
0
        if (m_rdCost.m_psyRd)
974
0
        {
975
0
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
976
0
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
977
0
        }
978
0
        else if(m_rdCost.m_ssimRd)
979
0
        {
980
0
            tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
981
0
            tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
982
0
        }
983
0
        else
984
0
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
985
986
0
        if (tmpCost < fullCost.rdcost)
987
0
        {
988
0
            bTSkip = useTSkip;
989
0
            bCBF = !!numSig;
990
0
            fullCost.rdcost = tmpCost;
991
0
            fullCost.distortion = tmpDist;
992
0
            fullCost.bits = tmpBits;
993
0
            fullCost.energy = tmpEnergy;
994
0
        }
995
0
    }
996
997
0
    if (bTSkip)
998
0
    {
999
0
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
1000
0
        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
1001
0
    }
1002
0
    else if (checkTransformSkip)
1003
0
    {
1004
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
1005
0
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
1006
0
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
1007
0
    }
1008
1009
    // set reconstruction for next intra prediction blocks
1010
0
    PicYuv*  reconPic = m_frame->m_reconPic[0];
1011
0
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
1012
0
    intptr_t picStride = reconPic->m_stride;
1013
0
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
1014
1015
0
    outCost.rdcost += fullCost.rdcost;
1016
0
    outCost.distortion += fullCost.distortion;
1017
0
    outCost.bits += fullCost.bits;
1018
0
    outCost.energy += fullCost.energy;
1019
0
}
1020
1021
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
1022
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
1023
0
{
1024
0
    CUData& cu = mode.cu;
1025
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
1026
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1027
0
    bool     bCheckFull = log2TrSize <= depthRange[1];
1028
1029
0
    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
1030
1031
    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
1032
     * since we are not measuring RD cost */
1033
0
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
1034
0
        bCheckFull = false;
1035
1036
0
    if (bCheckFull)
1037
0
    {
1038
0
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
1039
0
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
1040
0
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
1041
0
        uint32_t stride   = mode.fencYuv->m_size;
1042
1043
        // init availability pattern
1044
0
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
1045
0
        IntraNeighbors intraNeighbors;
1046
0
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
1047
0
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
1048
1049
        // get prediction signal
1050
0
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
1051
1052
0
        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
1053
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
1054
1055
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
1056
0
        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
1057
1058
0
        uint32_t sizeIdx   = log2TrSize - 2;
1059
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1060
1061
0
        PicYuv*  reconPic = m_frame->m_reconPic[0];
1062
0
        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
1063
0
        intptr_t picStride = reconPic->m_stride;
1064
1065
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
1066
0
        if (numSig)
1067
0
        {
1068
0
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
1069
0
            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
1070
0
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
1071
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
1072
0
            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
1073
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
1074
0
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
1075
0
        }
1076
0
        else
1077
0
        {
1078
0
            primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
1079
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
1080
0
        }
1081
0
    }
1082
0
    else
1083
0
    {
1084
0
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
1085
1086
        /* code split block */
1087
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1088
0
        uint32_t cbf = 0;
1089
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1090
0
        {
1091
0
            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
1092
0
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
1093
0
        }
1094
0
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
1095
0
    }
1096
0
}
1097
1098
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
1099
1.58M
{
1100
1.58M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1101
1102
1.58M
    if (tuDepth == cu.m_tuDepth[absPartIdx])
1103
1.58M
    {
1104
1.58M
        uint32_t qtLayer    = log2TrSize - 2;
1105
1106
        // copy transform coefficients
1107
1.58M
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
1108
1.58M
        coeff_t* coeffSrcY    = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
1109
1.58M
        coeff_t* coeffDestY   = cu.m_trCoeff[0]            + coeffOffsetY;
1110
1.58M
        memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
1111
1112
        // copy reconstruction
1113
1.58M
        m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
1114
1.58M
    }
1115
462
    else
1116
462
    {
1117
462
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1118
2.36k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
1119
1.90k
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
1120
462
    }
1121
1.58M
}
1122
1123
inline void offsetCBFs(uint8_t subTUCBF[2])
1124
0
{
1125
0
    uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
1126
0
    subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
1127
0
    subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
1128
0
}
1129
1130
/* 4:2:2 post-TU split processing */
1131
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
1132
0
{
1133
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1134
1135
0
    if (log2TrSize == 2)
1136
0
    {
1137
0
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1138
0
        ++log2TrSize;
1139
0
    }
1140
1141
0
    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
1142
1143
    // move the CBFs down a level and set the parent CBF
1144
0
    uint8_t subTUCBF[2];
1145
0
    subTUCBF[0] = cu.getCbf(absPartIdx            , ttype, tuDepth);
1146
0
    subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
1147
0
    offsetCBFs(subTUCBF);
1148
1149
0
    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx             , tuNumParts);
1150
0
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
1151
0
}
1152
1153
/* returns distortion */
1154
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
1155
9.34M
{
1156
9.34M
    CUData& cu = mode.cu;
1157
9.34M
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1158
9.34M
    bool bEnableRDOQ = !!m_param->rdoqLevel;
1159
1160
9.34M
    if (tuDepth < cu.m_tuDepth[absPartIdx])
1161
1.49M
    {
1162
1.49M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1163
1.49M
        uint32_t splitCbfU = 0, splitCbfV = 0;
1164
7.45M
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1165
5.96M
        {
1166
5.96M
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
1167
5.96M
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1168
5.96M
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1169
5.96M
        }
1170
1.49M
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
1171
1.49M
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
1172
1173
1.49M
        return;
1174
1.49M
    }
1175
1176
7.85M
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1177
7.85M
    uint32_t tuDepthC = tuDepth;
1178
7.85M
    if (log2TrSizeC < 2)
1179
5.95M
    {
1180
5.95M
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1181
5.95M
        if (absPartIdx & 3)
1182
4.47M
            return;
1183
1.48M
        log2TrSizeC = 2;
1184
1.48M
        tuDepthC--;
1185
1.48M
    }
1186
1187
3.37M
    if (bEnableRDOQ)
1188
3.41M
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
1189
1190
3.37M
    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
1191
3.37M
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
1192
3.37M
    if (checkTransformSkip)
1193
0
    {
1194
0
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
1195
0
        return;
1196
0
    }
1197
1198
3.37M
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1199
3.37M
    uint32_t qtLayer = log2TrSize - 2;
1200
3.37M
    uint32_t stride = mode.fencYuv->m_csize;
1201
3.37M
    const uint32_t sizeIdxC = log2TrSizeC - 2;
1202
1203
3.37M
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1204
3.37M
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1205
1206
3.37M
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1207
3.37M
    do
1208
3.37M
    {
1209
3.37M
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1210
1211
3.37M
        IntraNeighbors intraNeighbors;
1212
3.37M
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1213
1214
10.1M
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1215
6.81M
        {
1216
6.81M
            TextType ttype = (TextType)chromaId;
1217
1218
6.81M
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1219
6.81M
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1220
6.81M
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1221
6.81M
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1222
6.81M
            coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1223
6.81M
            pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1224
6.81M
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
1225
6.81M
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1226
6.81M
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1227
6.81M
            intptr_t picStride = reconPic->m_strideC;
1228
1229
6.81M
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1230
6.81M
            if (chromaPredMode == DM_CHROMA_IDX)
1231
1.36M
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1232
6.81M
            if (m_csp == X265_CSP_I422)
1233
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1234
1235
            // init availability pattern
1236
6.81M
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1237
1238
            // get prediction signal
1239
6.81M
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1240
6.81M
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1241
1242
6.81M
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1243
1244
6.81M
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
1245
6.81M
            if (numSig)
1246
25.5k
            {
1247
25.5k
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
1248
25.5k
                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1249
25.5k
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1250
25.5k
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1251
25.5k
                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
1252
25.5k
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
1253
25.5k
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1254
25.5k
            }
1255
6.78M
            else
1256
6.78M
            {
1257
                // no coded residual, recon = pred
1258
6.78M
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
1259
6.78M
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1260
6.78M
            }
1261
1262
6.81M
            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
1263
1264
6.81M
            if (m_rdCost.m_psyRd)
1265
6.82M
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1266
18.4E
            else if(m_rdCost.m_ssimRd)
1267
0
                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1268
1269
6.81M
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
1270
6.81M
        }
1271
3.37M
    }
1272
3.37M
    while (tuIterator.isNextSection());
1273
1274
3.37M
    if (splitType == VERTICAL_SPLIT)
1275
0
    {
1276
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1277
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1278
0
    }
1279
3.37M
}
1280
1281
/* returns distortion */
1282
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
1283
0
{
1284
0
    CUData& cu = mode.cu;
1285
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
1286
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1287
0
    const uint32_t log2TrSizeC = 2;
1288
0
    uint32_t qtLayer = log2TrSize - 2;
1289
1290
    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
1291
     * so the entropy coder is not very accurate. The best we can do is return it in the same
1292
     * condition as it arrived, and to do all bit estimates from the same state. */
1293
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
1294
1295
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1296
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1297
1298
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1299
0
    do
1300
0
    {
1301
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1302
1303
0
        IntraNeighbors intraNeighbors;
1304
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1305
1306
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1307
0
        {
1308
0
            TextType ttype = (TextType)chromaId;
1309
1310
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1311
0
            pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1312
0
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
1313
0
            uint32_t stride = mode.fencYuv->m_csize;
1314
0
            const uint32_t sizeIdxC = log2TrSizeC - 2;
1315
1316
0
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1317
0
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1318
0
            pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1319
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
1320
1321
            // init availability pattern
1322
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1323
1324
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1325
0
            if (chromaPredMode == DM_CHROMA_IDX)
1326
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1327
0
            if (m_csp == X265_CSP_I422)
1328
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1329
1330
            // get prediction signal
1331
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1332
1333
0
            uint64_t bCost = MAX_INT64;
1334
0
            sse_t bDist = 0;
1335
0
            uint32_t bCbf = 0;
1336
0
            uint32_t bEnergy = 0;
1337
0
            int      bTSkip = 0;
1338
1339
0
            int checkTransformSkip = 1;
1340
0
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
1341
0
            {
1342
0
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
1343
0
                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
1344
0
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
1345
1346
0
                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1347
1348
0
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
1349
0
                if (numSig)
1350
0
                {
1351
0
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
1352
0
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
1353
0
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1354
0
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1355
0
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
1356
0
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
1357
0
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1358
0
                }
1359
0
                else if (useTSkip)
1360
0
                {
1361
0
                    checkTransformSkip = 0;
1362
0
                    break;
1363
0
                }
1364
0
                else
1365
0
                {
1366
0
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
1367
0
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1368
0
                }
1369
0
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
1370
0
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
1371
1372
0
                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1373
1374
0
                uint32_t tmpBits = 0, tmpEnergy = 0;
1375
0
                if (numSig)
1376
0
                {
1377
0
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1378
0
                    m_entropyCoder.resetBits();
1379
0
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1380
0
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
1381
0
                }
1382
1383
0
                uint64_t tmpCost;
1384
0
                if (m_rdCost.m_psyRd)
1385
0
                {
1386
0
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1387
0
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
1388
0
                }
1389
0
                else if(m_rdCost.m_ssimRd)
1390
0
                {
1391
0
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1392
0
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
1393
0
                }
1394
0
                else
1395
0
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
1396
1397
0
                if (tmpCost < bCost)
1398
0
                {
1399
0
                    bCost = tmpCost;
1400
0
                    bDist = tmpDist;
1401
0
                    bTSkip = useTSkip;
1402
0
                    bCbf = !!numSig;
1403
0
                    bEnergy = tmpEnergy;
1404
0
                }
1405
0
            }
1406
1407
0
            if (bTSkip)
1408
0
            {
1409
0
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
1410
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
1411
0
            }
1412
1413
0
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1414
0
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1415
1416
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1417
0
            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1418
0
            intptr_t picStride = reconPic->m_strideC;
1419
0
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
1420
1421
0
            outCost.distortion += bDist;
1422
0
            outCost.energy += bEnergy;
1423
0
        }
1424
0
    }
1425
0
    while (tuIterator.isNextSection());
1426
1427
0
    if (splitType == VERTICAL_SPLIT)
1428
0
    {
1429
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1430
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1431
0
    }
1432
1433
0
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1434
0
}
1435
1436
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
1437
1.05M
{
1438
1.05M
    uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
1439
1.05M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1440
1.05M
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1441
1442
1.05M
    if (tuDepthL == tuDepth || log2TrSizeC == 2)
1443
1.05M
    {
1444
        // copy transform coefficients
1445
1.05M
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
1446
1.05M
        uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1447
1448
1.05M
        uint32_t qtLayer   = log2TrSize - 2 - (tuDepthL - tuDepth);
1449
1.05M
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
1450
1.05M
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
1451
1.05M
        coeff_t* coeffDstU = cu.m_trCoeff[1]           + coeffOffsetC;
1452
1.05M
        coeff_t* coeffDstV = cu.m_trCoeff[2]           + coeffOffsetC;
1453
1.05M
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
1454
1.05M
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
1455
1456
        // copy reconstruction
1457
1.05M
        m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
1458
1.05M
    }
1459
310
    else
1460
310
    {
1461
310
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1462
1.55k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
1463
1.24k
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
1464
310
    }
1465
1.05M
}
1466
1467
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
1468
0
{
1469
0
    CUData& cu = mode.cu;
1470
0
    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
1471
1472
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
1473
0
    {
1474
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1475
0
        uint32_t splitCbfU = 0, splitCbfV = 0;
1476
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1477
0
        {
1478
0
            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
1479
0
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1480
0
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1481
0
        }
1482
0
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
1483
0
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
1484
1485
0
        return;
1486
0
    }
1487
1488
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1489
0
    uint32_t tuDepthC = tuDepth;
1490
0
    if (log2TrSizeC < 2)
1491
0
    {
1492
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1493
0
        if (absPartIdx & 3)
1494
0
            return;
1495
0
        log2TrSizeC = 2;
1496
0
        tuDepthC--;
1497
0
    }
1498
1499
0
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1500
0
    uint32_t stride = mode.fencYuv->m_csize;
1501
0
    const uint32_t sizeIdxC = log2TrSizeC - 2;
1502
1503
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1504
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1505
1506
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1507
0
    do
1508
0
    {
1509
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1510
1511
0
        IntraNeighbors intraNeighbors;
1512
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1513
1514
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1515
0
        {
1516
0
            TextType ttype = (TextType)chromaId;
1517
1518
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1519
0
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1520
0
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1521
0
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1522
0
            coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
1523
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1524
0
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1525
0
            intptr_t picStride = reconPic->m_strideC;
1526
1527
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1528
0
            if (chromaPredMode == DM_CHROMA_IDX)
1529
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1530
0
            if (m_csp == X265_CSP_I422)
1531
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1532
1533
            // init availability pattern
1534
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1535
1536
            // get prediction signal
1537
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1538
1539
0
            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
1540
1541
0
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1542
1543
0
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
1544
0
            if (numSig)
1545
0
            {
1546
0
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
1547
0
                bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
1548
0
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1549
0
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
1550
0
                bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
1551
0
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
1552
0
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1553
0
            }
1554
0
            else
1555
0
            {
1556
                // no coded residual, recon = pred
1557
0
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
1558
0
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1559
0
            }
1560
0
        }
1561
0
    }
1562
0
    while (tuIterator.isNextSection());
1563
1564
0
    if (splitType == VERTICAL_SPLIT)
1565
0
    {
1566
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1567
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1568
0
    }
1569
0
}
1570
1571
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
1572
683k
{
1573
683k
    CUData& cu = intraMode.cu;
1574
1575
683k
    cu.setPartSizeSubParts(partSize);
1576
683k
    cu.setPredModeSubParts(MODE_INTRA);
1577
1578
683k
    uint32_t tuDepthRange[2];
1579
683k
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1580
1581
683k
    intraMode.initCosts();
1582
683k
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
1583
683k
    if (m_csp != X265_CSP_I400)
1584
683k
    {
1585
683k
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
1586
683k
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
1587
683k
    }
1588
18.4E
    else
1589
18.4E
        intraMode.distortion += intraMode.lumaDistortion;
1590
683k
    cu.m_distortion[0] = intraMode.distortion;
1591
683k
    m_entropyCoder.resetBits();
1592
683k
    if (m_slice->m_pps->bTransquantBypassEnabled)
1593
198k
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1594
1595
683k
    int skipFlagBits = 0;
1596
683k
    if (!m_slice->isIntra())
1597
0
    {
1598
0
        m_entropyCoder.codeSkipFlag(cu, 0);
1599
0
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
1600
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
1601
0
    }
1602
1603
683k
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
1604
683k
    m_entropyCoder.codePredInfo(cu, 0);
1605
683k
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
1606
1607
683k
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
1608
683k
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
1609
683k
    m_entropyCoder.store(intraMode.contexts);
1610
683k
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1611
683k
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
1612
683k
    const Yuv* fencYuv = intraMode.fencYuv;
1613
683k
    if (m_rdCost.m_psyRd)
1614
683k
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
1615
25
    else if(m_rdCost.m_ssimRd)
1616
0
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
1617
1618
683k
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
1619
1620
683k
    updateModeCost(intraMode);
1621
683k
    checkDQP(intraMode, cuGeom);
1622
1623
#if ENABLE_SCC_EXT
1624
    if (m_param->bEnableSCC)
1625
        intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
1626
#endif
1627
683k
}
1628
1629
/* Note that this function does not save the best intra prediction, it must
1630
 * be generated later. It records the best mode in the cu */
1631
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1632
0
{
1633
0
    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1634
1635
0
    CUData& cu = intraMode.cu;
1636
0
    uint32_t depth = cuGeom.depth;
1637
1638
0
    cu.setPartSizeSubParts(SIZE_2Nx2N);
1639
0
    cu.setPredModeSubParts(MODE_INTRA);
1640
1641
0
    const uint32_t initTuDepth = 0;
1642
0
    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
1643
0
    uint32_t tuSize = 1 << log2TrSize;
1644
0
    const uint32_t absPartIdx = 0;
1645
1646
    // Reference sample smoothing
1647
0
    IntraNeighbors intraNeighbors;
1648
0
    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1649
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1650
1651
0
    const pixel* fenc = intraMode.fencYuv->m_buf[0];
1652
0
    uint32_t stride = intraMode.fencYuv->m_size;
1653
1654
0
    int sad, bsad;
1655
0
    uint32_t bits, bbits, mode, bmode;
1656
0
    uint64_t cost, bcost;
1657
1658
    // 33 Angle modes once
1659
0
    int scaleTuSize = tuSize;
1660
0
    int scaleStride = stride;
1661
0
    int costShift = 0;
1662
0
    int sizeIdx = log2TrSize - 2;
1663
1664
0
    if (tuSize > 32)
1665
0
    {
1666
        // CU is 64x64, we scale to 32x32 and adjust required parameters
1667
0
        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
1668
0
        fenc = m_fencScaled;
1669
1670
0
        pixel nScale[129];
1671
0
        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
1672
0
        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
1673
1674
        // we do not estimate filtering for downscaled samples
1675
0
        memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
1676
0
        memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
1677
1678
0
        scaleTuSize = 32;
1679
0
        scaleStride = 32;
1680
0
        costShift = 2;
1681
0
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1682
0
    }
1683
1684
0
    pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1685
0
    int predsize = scaleTuSize * scaleTuSize;
1686
1687
0
    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1688
1689
    /* there are three cost tiers for intra modes:
1690
     *  pred[0]          - mode probable, least cost
1691
     *  pred[1], pred[2] - less probable, slightly more cost
1692
     *  non-mpm modes    - all cost the same (rbits) */
1693
0
    uint64_t mpms;
1694
0
    uint32_t mpmModes[3];
1695
0
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1696
1697
    // DC
1698
0
    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1699
0
    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1700
0
    bmode = mode = DC_IDX;
1701
0
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1702
0
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1703
1704
    // PLANAR
1705
0
    pixel* planar = intraNeighbourBuf[0];
1706
0
    if (tuSize & (8 | 16 | 32))
1707
0
        planar = intraNeighbourBuf[1];
1708
1709
0
    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
1710
0
    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1711
0
    mode = PLANAR_IDX;
1712
0
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1713
0
    cost = m_rdCost.calcRdSADCost(sad, bits);
1714
0
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1715
1716
0
    bool allangs = true;
1717
0
    if (primitives.cu[sizeIdx].intra_pred_allangs)
1718
0
    {
1719
0
        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1720
0
        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
1721
0
    }
1722
0
    else
1723
0
        allangs = false;
1724
1725
0
#define TRY_ANGLE(angle) \
1726
0
    if (allangs) { \
1727
0
        if (angle < 18) \
1728
0
            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1729
0
        else \
1730
0
            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1731
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1732
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1733
0
    } else { \
1734
0
        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
1735
0
        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
1736
0
        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
1737
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1738
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1739
0
    }
1740
1741
0
    if (m_param->bEnableFastIntra)
1742
0
    {
1743
0
        int asad = 0;
1744
0
        uint32_t lowmode, highmode, amode = 5, abits = 0;
1745
0
        uint64_t acost = MAX_INT64;
1746
1747
        /* pick the best angle, sampling at distance of 5 */
1748
0
        for (mode = 5; mode < 35; mode += 5)
1749
0
        {
1750
0
            TRY_ANGLE(mode);
1751
0
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1752
0
        }
1753
1754
        /* refine best angle at distance 2, then distance 1 */
1755
0
        for (uint32_t dist = 2; dist >= 1; dist--)
1756
0
        {
1757
0
            lowmode = amode - dist;
1758
0
            highmode = amode + dist;
1759
1760
0
            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1761
0
            TRY_ANGLE(lowmode);
1762
0
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1763
1764
0
            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1765
0
            TRY_ANGLE(highmode);
1766
0
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1767
0
        }
1768
1769
0
        if (amode == 33)
1770
0
        {
1771
0
            TRY_ANGLE(34);
1772
0
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1773
0
        }
1774
1775
0
        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1776
0
    }
1777
0
    else // calculate and search all intra prediction angles for lowest cost
1778
0
    {
1779
0
        for (mode = 2; mode < 35; mode++)
1780
0
        {
1781
0
            TRY_ANGLE(mode);
1782
0
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1783
0
        }
1784
0
    }
1785
1786
0
    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
1787
0
    intraMode.initCosts();
1788
0
    intraMode.totalBits = bbits;
1789
0
    intraMode.distortion = bsad;
1790
0
    intraMode.sa8dCost = bcost;
1791
0
    intraMode.sa8dBits = bbits;
1792
0
}
1793
1794
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1795
0
{
1796
0
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1797
1798
0
    CUData& cu = intraMode.cu;
1799
0
    Yuv* reconYuv = &intraMode.reconYuv;
1800
1801
0
    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
1802
0
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
1803
1804
0
    uint32_t tuDepthRange[2];
1805
0
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1806
1807
0
    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
1808
1809
0
    Cost icosts;
1810
0
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
1811
0
    extractIntraResultQT(cu, *reconYuv, 0, 0);
1812
1813
0
    intraMode.lumaDistortion = icosts.distortion;
1814
0
    if (m_csp != X265_CSP_I400)
1815
0
    {
1816
0
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
1817
0
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
1818
0
    }
1819
0
    else
1820
0
        intraMode.distortion = intraMode.lumaDistortion;
1821
1822
0
    m_entropyCoder.resetBits();
1823
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
1824
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1825
0
    m_entropyCoder.codeSkipFlag(cu, 0);
1826
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
1827
0
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
1828
0
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
1829
0
    m_entropyCoder.codePredInfo(cu, 0);
1830
0
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
1831
1832
0
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
1833
0
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
1834
1835
0
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1836
0
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
1837
0
    const Yuv* fencYuv = intraMode.fencYuv;
1838
0
    if (m_rdCost.m_psyRd)
1839
0
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
1840
0
    else if(m_rdCost.m_ssimRd)
1841
0
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
1842
1843
0
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
1844
0
    m_entropyCoder.store(intraMode.contexts);
1845
0
    updateModeCost(intraMode);
1846
0
    checkDQP(intraMode, cuGeom);
1847
0
}
1848
1849
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
1850
683k
{
1851
683k
    CUData& cu = intraMode.cu;
1852
683k
    Yuv* reconYuv = &intraMode.reconYuv;
1853
683k
    Yuv* predYuv = &intraMode.predYuv;
1854
683k
    const Yuv* fencYuv = intraMode.fencYuv;
1855
1856
683k
    uint32_t depth        = cuGeom.depth;
1857
683k
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
1858
683k
    uint32_t numPU        = 1 << (2 * initTuDepth);
1859
683k
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
1860
683k
    uint32_t tuSize       = 1 << log2TrSize;
1861
683k
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;
1862
683k
    uint32_t sizeIdx      = log2TrSize - 2;
1863
683k
    uint32_t absPartIdx   = 0;
1864
683k
    sse_t totalDistortion = 0;
1865
1866
683k
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
1867
1868
    // loop over partitions
1869
2.25M
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
1870
1.57M
    {
1871
1.57M
        uint32_t bmode = 0;
1872
1873
1.57M
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
1874
0
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
1875
1.57M
        else
1876
1.57M
        {
1877
1.57M
            uint64_t candCostList[MAX_RD_INTRA_MODES];
1878
1.57M
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
1879
1.57M
            uint64_t bcost;
1880
1.57M
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
1881
1882
1.57M
            {
1883
1.57M
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1884
1885
                // Reference sample smoothing
1886
1.57M
                IntraNeighbors intraNeighbors;
1887
1.57M
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1888
1.57M
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1889
1890
                // determine set of modes to be tested (using prediction signal only)
1891
1.57M
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
1892
1.57M
                uint32_t stride = predYuv->m_size;
1893
1894
1.57M
                int scaleTuSize = tuSize;
1895
1.57M
                int scaleStride = stride;
1896
1.57M
                int costShift = 0;
1897
1898
1.57M
                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1899
1900
                /* there are three cost tiers for intra modes:
1901
                *  pred[0]          - mode probable, least cost
1902
                *  pred[1], pred[2] - less probable, slightly more cost
1903
                *  non-mpm modes    - all cost the same (rbits) */
1904
1.57M
                uint64_t mpms;
1905
1.57M
                uint32_t mpmModes[3];
1906
1.57M
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1907
1908
1.57M
                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1909
1.57M
                uint64_t modeCosts[35];
1910
1911
                // DC
1912
1.57M
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1913
1.57M
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
1914
1.57M
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1915
1.57M
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1916
1917
                // PLANAR
1918
1.57M
                pixel* planar = intraNeighbourBuf[0];
1919
1.57M
                if (tuSize >= 8 && tuSize <= 32)
1920
385k
                    planar = intraNeighbourBuf[1];
1921
1922
1.57M
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
1923
1.57M
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
1924
1.57M
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1925
1.57M
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1926
1.57M
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1927
1928
                // angular predictions
1929
1.57M
                if (primitives.cu[sizeIdx].intra_pred_allangs)
1930
0
                {
1931
0
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1932
0
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
1933
0
                    for (int mode = 2; mode < 35; mode++)
1934
0
                    {
1935
0
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1936
0
                        if (mode < 18)
1937
0
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1938
0
                        else
1939
0
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1940
0
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1941
0
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1942
0
                    }
1943
0
                }
1944
1.57M
                else
1945
1.57M
                {
1946
52.7M
                    for (int mode = 2; mode < 35; mode++)
1947
51.2M
                    {
1948
51.2M
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1949
51.2M
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
1950
51.2M
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
1951
51.2M
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
1952
51.2M
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1953
51.2M
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1954
51.2M
                    }
1955
1.57M
                }
1956
1957
                /* Find the top maxCandCount candidate modes with cost within 25% of best
1958
                * or among the most probable modes. maxCandCount is derived from the
1959
                * rdLevel and depth. In general we want to try more modes at slower RD
1960
                * levels and at higher depths */
1961
13.6M
                for (int i = 0; i < maxCandCount; i++)
1962
12.0M
                    candCostList[i] = MAX_INT64;
1963
1964
1.57M
                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
1965
56.7M
                for (int mode = 0; mode < 35; mode++)
1966
55.1M
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
1967
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
1968
1.66M
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1969
1.57M
            }
1970
1971
            /* measure best candidates using simple RDO (no TU splits) */
1972
1.57M
            bcost = MAX_INT64;
1973
3.17M
            for (int i = 0; i < maxCandCount; i++)
1974
3.17M
            {
1975
3.17M
                if (candCostList[i] == MAX_INT64)
1976
1.57M
                    break;
1977
1978
1.59M
                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1979
1980
1.59M
                m_entropyCoder.load(m_rqt[depth].cur);
1981
1.59M
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
1982
1983
1.59M
                Cost icosts;
1984
1.59M
                if (checkTransformSkip)
1985
0
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1986
1.59M
                else
1987
1.59M
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
1988
1.59M
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1989
1.59M
            }
1990
1.57M
        }
1991
1992
1.57M
        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1993
1994
        /* remeasure best mode, allowing TU splits */
1995
1.57M
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
1996
1.57M
        m_entropyCoder.load(m_rqt[depth].cur);
1997
1998
1.57M
        Cost icosts;
1999
1.57M
        if (checkTransformSkip)
2000
0
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
2001
1.57M
        else
2002
1.57M
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
2003
1.57M
        totalDistortion += icosts.distortion;
2004
2005
1.57M
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
2006
2007
        // set reconstruction for next intra prediction blocks
2008
1.57M
        if (puIdx != numPU - 1)
2009
895k
        {
2010
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
2011
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
2012
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
2013
             * that the contexts should be tracked through each PU */
2014
895k
            PicYuv*  reconPic = m_frame->m_reconPic[0];
2015
895k
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
2016
895k
            uint32_t dststride = reconPic->m_stride;
2017
895k
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
2018
895k
            uint32_t srcstride = reconYuv->m_size;
2019
895k
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
2020
895k
        }
2021
1.57M
    }
2022
2023
683k
    if (numPU > 1)
2024
298k
    {
2025
298k
        uint32_t combCbfY = 0;
2026
1.49M
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2027
1.19M
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
2028
2029
298k
        cu.m_cbf[0][0] |= combCbfY;
2030
298k
    }
2031
2032
    // TODO: remove this
2033
683k
    m_entropyCoder.load(m_rqt[depth].cur);
2034
2035
683k
    return totalDistortion;
2036
683k
}
2037
2038
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
2039
0
{
2040
0
    CUData& cu = intraMode.cu;
2041
0
    const Yuv* fencYuv = intraMode.fencYuv;
2042
0
    Yuv* predYuv = &intraMode.predYuv;
2043
2044
0
    uint32_t bestMode  = 0;
2045
0
    uint64_t bestCost  = MAX_INT64;
2046
0
    uint32_t modeList[NUM_CHROMA_MODE];
2047
2048
0
    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
2049
0
    uint32_t tuSize = 1 << log2TrSizeC;
2050
0
    uint32_t tuDepth = 0;
2051
0
    int32_t costShift = 0;
2052
2053
0
    if (tuSize > 32)
2054
0
    {
2055
0
        tuDepth = 1;
2056
0
        costShift = 2;
2057
0
        log2TrSizeC = 5;
2058
0
    }
2059
2060
0
    IntraNeighbors intraNeighbors;
2061
0
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
2062
0
    cu.getAllowedChromaDir(0, modeList);
2063
2064
    // check chroma modes
2065
0
    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
2066
0
    {
2067
0
        uint32_t chromaPredMode = modeList[mode];
2068
0
        if (chromaPredMode == DM_CHROMA_IDX)
2069
0
            chromaPredMode = cu.m_lumaIntraDir[0];
2070
0
        if (m_csp == X265_CSP_I422)
2071
0
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
2072
2073
0
        uint64_t cost = 0;
2074
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2075
0
        {
2076
0
            const pixel* fenc = fencYuv->m_buf[chromaId];
2077
0
            pixel* pred = predYuv->m_buf[chromaId];
2078
0
            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
2079
            // get prediction signal
2080
0
            predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC);
2081
0
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
2082
0
        }
2083
2084
0
        if (cost < bestCost)
2085
0
        {
2086
0
            bestCost = cost;
2087
0
            bestMode = modeList[mode];
2088
0
        }
2089
0
    }
2090
2091
0
    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
2092
0
}
2093
2094
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
2095
683k
{
2096
683k
    CUData& cu = intraMode.cu;
2097
683k
    Yuv& reconYuv = intraMode.reconYuv;
2098
2099
683k
    uint32_t depth       = cuGeom.depth;
2100
683k
    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
2101
683k
    uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
2102
683k
    uint32_t absPartStep = cuGeom.numPartitions;
2103
683k
    sse_t totalDistortion = 0;
2104
2105
683k
    int size = partitionFromLog2Size(log2TrSize);
2106
2107
683k
    TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
2108
2109
683k
    do
2110
683k
    {
2111
683k
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2112
2113
683k
        uint32_t bestMode = 0;
2114
683k
        sse_t bestDist = 0;
2115
683k
        uint64_t bestCost = MAX_INT64;
2116
2117
        // init mode list
2118
683k
        uint32_t minMode = 0;
2119
683k
        uint32_t maxMode = NUM_CHROMA_MODE;
2120
683k
        uint32_t modeList[NUM_CHROMA_MODE];
2121
2122
683k
        if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
2123
0
        {
2124
0
            for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
2125
0
                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
2126
0
            maxMode = 1;
2127
0
        }
2128
683k
        else
2129
683k
            cu.getAllowedChromaDir(absPartIdxC, modeList);
2130
2131
683k
        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
2132
0
        {
2133
0
            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
2134
0
                modeList[l] = modeList[0];
2135
0
            maxMode = 1;
2136
0
        }
2137
        // check chroma modes
2138
4.09M
        for (uint32_t mode = minMode; mode < maxMode; mode++)
2139
3.40M
        {
2140
            // restore context models
2141
3.40M
            m_entropyCoder.load(m_rqt[depth].cur);
2142
2143
3.40M
            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
2144
3.40M
            Cost outCost;
2145
3.40M
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);
2146
2147
3.40M
            if (m_slice->m_pps->bTransformSkipEnabled)
2148
0
                m_entropyCoder.load(m_rqt[depth].cur);
2149
2150
3.40M
            m_entropyCoder.resetBits();
2151
            // chroma prediction mode
2152
3.40M
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
2153
3.41M
            {
2154
3.41M
                if (!absPartIdxC)
2155
3.41M
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
2156
3.41M
            }
2157
18.4E
            else
2158
18.4E
            {
2159
18.4E
                uint32_t qNumParts = cuGeom.numPartitions >> 2;
2160
18.4E
                if (!(absPartIdxC & (qNumParts - 1)))
2161
0
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
2162
18.4E
            }
2163
2164
3.40M
            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
2165
3.40M
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
2166
3.40M
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
2167
3.40M
            uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
2168
18.4E
            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
2169
18.4E
                                             : m_rdCost.calcRdCost(outCost.distortion, bits);
2170
2171
3.40M
            if (cost < bestCost)
2172
1.05M
            {
2173
1.05M
                bestCost = cost;
2174
1.05M
                bestDist = outCost.distortion;
2175
1.05M
                bestMode = modeList[mode];
2176
1.05M
                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
2177
1.05M
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2178
1.05M
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2179
1.05M
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2180
1.05M
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2181
1.05M
            }
2182
3.40M
        }
2183
2184
683k
        if (!tuIterator.isLastSection())
2185
0
        {
2186
0
            uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
2187
0
            PicYuv*  reconPic  = m_frame->m_reconPic[0];
2188
0
            uint32_t dststride = reconPic->m_strideC;
2189
0
            const pixel* src;
2190
0
            pixel* dst;
2191
2192
0
            dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
2193
0
            src = reconYuv.getCbAddr(absPartIdxC);
2194
0
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
2195
2196
0
            dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
2197
0
            src = reconYuv.getCrAddr(absPartIdxC);
2198
0
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
2199
0
        }
2200
2201
683k
        memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
2202
683k
        memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
2203
683k
        memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
2204
683k
        memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
2205
683k
        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
2206
683k
        totalDistortion += bestDist;
2207
683k
    }
2208
683k
    while (tuIterator.isNextSection());
2209
2210
683k
    if (initTuDepth != 0)
2211
0
    {
2212
0
        uint32_t combCbfU = 0;
2213
0
        uint32_t combCbfV = 0;
2214
0
        uint32_t qNumParts = tuIterator.absPartIdxStep;
2215
0
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2216
0
        {
2217
0
            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
2218
0
            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
2219
0
        }
2220
2221
0
        cu.m_cbf[1][0] |= combCbfU;
2222
0
        cu.m_cbf[2][0] |= combCbfV;
2223
0
    }
2224
2225
    /* TODO: remove this */
2226
683k
    m_entropyCoder.load(m_rqt[depth].cur);
2227
683k
    return totalDistortion;
2228
683k
}
2229
2230
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
2231
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
2232
0
{
2233
0
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
2234
2235
0
    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
2236
0
    uint8_t  candDir[MRG_MAX_NUM_CANDS];
2237
0
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
2238
#if ENABLE_SCC_EXT
2239
    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
2240
#else
2241
0
    if (cu.isBipredRestriction())
2242
0
    {
2243
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2244
0
        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2245
0
        {
2246
0
            if (candDir[mergeCand] == 3)
2247
0
            {
2248
0
                candDir[mergeCand] = 1;
2249
0
                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
2250
0
            }
2251
0
        }
2252
0
    }
2253
0
#endif
2254
2255
0
    Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2256
2257
0
    uint32_t outCost = MAX_UINT;
2258
0
    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2259
0
    {
2260
        /* Prevent TMVP candidates from using unavailable reference pixels */
2261
0
        if (m_bFrameParallel)
2262
0
        {
2263
            // Parallel slices bound check
2264
0
            if (m_param->maxSlices > 1)
2265
0
            {
2266
0
                if (cu.m_bFirstRowInSlice &
2267
0
                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
2268
0
                    continue;
2269
2270
                // Last row in slice can't reference beyond bound since it is another slice area
2271
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2272
0
                if (cu.m_bLastRowInSlice &&
2273
0
                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
2274
0
                    continue;
2275
0
            }
2276
2277
0
            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2278
0
                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
2279
0
                continue;
2280
0
        }
2281
2282
#if ENABLE_SCC_EXT
2283
        if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc))
2284
        {
2285
            continue;
2286
        }
2287
#endif
2288
0
        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
2289
0
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
2290
0
        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
2291
0
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
2292
2293
0
        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
2294
2295
0
        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
2296
0
        if (m_me.bChromaSATD)
2297
0
            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
2298
2299
0
        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
2300
0
        costCand = costCand + m_rdCost.getCost(bitsCand);
2301
0
        if (costCand < outCost)
2302
0
        {
2303
0
            outCost = costCand;
2304
0
            m.bits = bitsCand;
2305
0
            m.index = mergeCand;
2306
0
        }
2307
0
    }
2308
2309
0
    m.mvField[0] = candMvField[m.index][0];
2310
0
    m.mvField[1] = candMvField[m.index][1];
2311
0
    m.dir = candDir[m.index];
2312
2313
0
    return outCost;
2314
0
}
2315
2316
/* find the lowres motion vector from lookahead in middle of current PU */
2317
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
2318
0
{
2319
0
    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
2320
0
    if (diffPoc > m_param->bframes + 1)
2321
        /* poc difference is out of range for lookahead */
2322
0
        return 0;
2323
2324
0
    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc];
2325
0
    if (mvs[0].x == 0x7FFF)
2326
        /* this motion search was not estimated by lookahead */
2327
0
        return 0;
2328
2329
0
    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
2330
0
    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
2331
0
    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
2332
2333
0
    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
2334
0
    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
2335
2336
0
    return mvs[idx] << 1; /* scale up lowres mv */
2337
0
}
2338
2339
/* Pick between the two AMVP candidates which is the best one to use as
2340
 * MVP for the motion search, based on SAD cost */
2341
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
2342
0
{
2343
0
    if (amvp[0] == amvp[1])
2344
0
        return 0;
2345
2346
0
    Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
2347
0
    uint32_t costs[AMVP_NUM_CANDS];
2348
2349
0
    for (int i = 0; i < AMVP_NUM_CANDS; i++)
2350
0
    {
2351
0
        MV mvCand = amvp[i];
2352
2353
        // NOTE: skip mvCand if Y is > merange and -FN>1
2354
0
        if (m_bFrameParallel)
2355
0
        {
2356
0
            costs[i] = m_me.COST_MAX;
2357
2358
0
            if (mvCand.y >= (m_param->searchRange + 1) * 4)
2359
0
                continue;
2360
2361
0
            if ((m_param->maxSlices > 1) &
2362
0
                ((mvCand.y < m_sliceMinY)
2363
0
              |  (mvCand.y > m_sliceMaxY)))
2364
0
                continue;
2365
0
        }
2366
0
        cu.clipMv(mvCand);
2367
#if ENABLE_SCC_EXT
2368
        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1)
2369
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand);
2370
        else
2371
#endif
2372
0
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
2373
0
        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
2374
0
    }
2375
2376
0
    return (costs[0] <= costs[1]) ? 0 : 1;
2377
0
}
2378
2379
void Search::PME::processTasks(int workerThreadId)
2380
0
{
2381
#if DETAILED_CU_STATS
2382
    int fe = mode.cu.m_encData->m_frameEncoderID;
2383
    master.m_stats[fe].countPMETasks++;
2384
    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
2385
#endif
2386
0
    ProfileScopeEvent(pme);
2387
0
    master.processPME(*this, master.m_tld[workerThreadId].analysis);
2388
0
}
2389
2390
void Search::processPME(PME& pme, Search& slave)
2391
0
{
2392
    /* acquire a motion estimation job, else exit early */
2393
0
    int meId;
2394
0
    pme.m_lock.acquire();
2395
0
    if (pme.m_jobTotal > pme.m_jobAcquired)
2396
0
    {
2397
0
        meId = pme.m_jobAcquired++;
2398
0
        pme.m_lock.release();
2399
0
    }
2400
0
    else
2401
0
    {
2402
0
        pme.m_lock.release();
2403
0
        return;
2404
0
    }
2405
2406
    /* Setup slave Search instance for ME for master's CU */
2407
0
    if (&slave != this)
2408
0
    {
2409
0
        slave.m_slice = m_slice;
2410
0
        slave.m_frame = m_frame;
2411
0
        slave.m_param = m_param;
2412
0
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
2413
0
        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
2414
0
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
2415
0
    }
2416
2417
    /* Perform ME, repeat until no more work is available */
2418
0
    do
2419
0
    {
2420
0
        if (meId < pme.m_jobs.refCnt[0])
2421
0
        {
2422
0
            int refIdx = pme.m_jobs.ref[0][meId]; //L0
2423
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
2424
0
        }
2425
0
        else
2426
0
        {
2427
0
            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
2428
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
2429
0
        }
2430
2431
0
        meId = -1;
2432
0
        pme.m_lock.acquire();
2433
0
        if (pme.m_jobTotal > pme.m_jobAcquired)
2434
0
            meId = pme.m_jobAcquired++;
2435
0
        pme.m_lock.release();
2436
0
    }
2437
0
    while (meId >= 0);
2438
0
}
2439
2440
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
2441
0
{
2442
0
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
2443
0
    int numIdx = m_slice->m_numRefIdx[list];
2444
#if ENABLE_SCC_EXT
2445
    if (!list && m_ibcEnabled)
2446
        numIdx--;
2447
#endif
2448
0
    bits += getTUBits(ref, numIdx);
2449
2450
0
    MotionData* bestME = interMode.bestME[part];
2451
2452
    // 12 mv candidates including lowresMV
2453
0
    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2454
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2455
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx);
2456
#else
2457
0
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2458
0
#endif
2459
2460
0
    const MV* amvp = interMode.amvpCand[list][ref];
2461
0
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
2462
0
    bool bLowresMVP = false;
2463
0
    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2464
2465
0
    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging if lowresMV is not available */
2466
0
    {
2467
0
        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
2468
0
        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2469
0
        if (lmv.notZero() && !layer)
2470
0
            mvc[numMvc++] = lmv;
2471
0
        if (m_param->bEnableHME)
2472
0
            mvp_lowres = lmv;
2473
0
    }
2474
2475
0
    m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc;
2476
0
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
2477
2478
0
    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2479
0
      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2480
2481
0
    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2482
0
    {
2483
0
        MV outmv_lowres;
2484
0
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2485
0
        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2486
0
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2487
0
        if (lowresMvCost < satdCost)
2488
0
        {
2489
0
            outmv = outmv_lowres;
2490
0
            satdCost = lowresMvCost;
2491
0
            bLowresMVP = true;
2492
0
        }
2493
0
    }
2494
    /* Get total cost of partition, but only include MV bit cost once */
2495
0
    bits += m_me.bitcost(outmv);
2496
0
    uint32_t mvCost = m_me.mvcost(outmv);
2497
0
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2498
2499
    /* Update LowresMVP to best AMVP cand*/
2500
0
    if (bLowresMVP)
2501
0
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2502
2503
    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2504
0
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2505
2506
    /* tie goes to the smallest ref ID, just like --no-pme */
2507
0
    ScopedLock _lock(master.m_meLock);
2508
0
    if (cost < bestME[list].cost ||
2509
0
       (cost == bestME[list].cost && ref < bestME[list].ref))
2510
0
    {
2511
0
        bestME[list].mv = outmv;
2512
0
        bestME[list].mvp = mvp;
2513
0
        bestME[list].mvpIdx = mvpIdx;
2514
0
        bestME[list].ref = ref;
2515
0
        bestME[list].cost = cost;
2516
0
        bestME[list].bits = bits;
2517
0
        bestME[list].mvCost  = mvCost;
2518
0
    }
2519
0
}
2520
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
2521
0
{
2522
0
    CUData& cu = interMode.cu;
2523
0
    MV mv, mvmin, mvmax;
2524
0
    int cand = 0, bestcost = INT_MAX;
2525
0
    while (cand < m_param->mvRefine)
2526
0
    {
2527
0
        if ((cand && mvp[cand] == mvp[cand - 1]) || (cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1])))
2528
0
        {
2529
0
            cand++;
2530
0
            continue;
2531
0
        }
2532
0
        MV bestMV;
2533
0
        mv = mvp[cand++];
2534
0
        cu.clipMv(mv);
2535
0
        m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2536
0
        setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
2537
0
        int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices, m_vertRestriction,
2538
0
        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2539
0
        if (bestcost > cost)
2540
0
        {
2541
0
            bestcost = cost;
2542
0
            outmv = bestMV;
2543
0
        }
2544
0
    }
2545
0
}
2546
/* find the best inter prediction for each PU of specified mode */
2547
#if ENABLE_SCC_EXT
2548
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2], MV* iMVCandList)
2549
#else
2550
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
2551
#endif
2552
0
{
2553
0
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
2554
2555
0
    CUData& cu = interMode.cu;
2556
0
    Yuv* predYuv = &interMode.predYuv;
2557
2558
    // 12 mv candidates including lowresMV
2559
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2560
2561
0
    const Slice *slice = m_slice;
2562
0
    int numPart     = cu.getNumPartInter(0);
2563
0
    int numPredDir  = slice->isInterP() ? 1 : 2;
2564
0
    const int* numRefIdx = slice->m_numRefIdx;
2565
0
    uint32_t lastMode = 0;
2566
0
    int      totalmebits = 0;
2567
0
    MV       mvzero(0, 0);
2568
0
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2569
0
    MergeData merge;
2570
0
    memset(&merge, 0, sizeof(merge));
2571
0
    bool useAsMVP = false;
2572
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
2573
0
    {
2574
0
        MotionData* bestME = interMode.bestME[puIdx];
2575
0
        PredictionUnit pu(cu, cuGeom, puIdx);
2576
0
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
2577
0
        useAsMVP = false;
2578
0
        x265_analysis_inter_data* interDataCTU = NULL;
2579
0
        int cuIdx;
2580
0
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
2581
0
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
2582
0
        {
2583
0
            interDataCTU = m_frame->m_analysisData.interData;
2584
0
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
2585
0
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
2586
0
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
2587
0
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
2588
0
                useAsMVP = true;
2589
0
        }
2590
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
2591
0
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
2592
0
        bestME[0].cost = MAX_UINT;
2593
0
        bestME[1].cost = MAX_UINT;
2594
2595
0
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
2596
0
        bool bDoUnidir = true;
2597
2598
0
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
2599
        /* Uni-directional prediction */
2600
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2601
0
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
2602
0
        {
2603
0
            for (int list = 0; list < numPredDir; list++)
2604
0
            {
2605
2606
0
                int ref = -1;
2607
0
                if (useAsMVP)
2608
0
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
2609
0
                else
2610
0
                    ref = bestME[list].ref;
2611
0
                if (ref < 0)
2612
0
                {
2613
0
                    continue;
2614
0
                }
2615
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2616
0
                int numIdx = m_slice->m_numRefIdx[list];
2617
#if ENABLE_SCC_EXT
2618
                if (!list && m_ibcEnabled)
2619
                    numIdx--;
2620
#endif
2621
0
                bits += getTUBits(ref, numIdx);
2622
2623
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2624
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2625
#else
2626
0
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2627
0
#endif
2628
0
                const MV* amvp = interMode.amvpCand[list][ref];
2629
0
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2630
0
                MV mvmin, mvmax, outmv, mvp;
2631
0
                if (useAsMVP)
2632
0
                {
2633
0
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
2634
0
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
2635
0
                }
2636
0
                else
2637
0
                    mvp = amvp[mvpIdx];
2638
0
                if (m_param->searchMethod == X265_SEA)
2639
0
                {
2640
0
                    int puX = puIdx & 1;
2641
0
                    int puY = puIdx >> 1;
2642
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2643
0
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2644
0
                }
2645
0
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2646
0
                MV mvpIn = mvp;
2647
0
                int satdCost;
2648
0
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
2649
0
                    mvpIn = bestME[list].mv;
2650
0
                if (useAsMVP && m_param->mvRefine > 1)
2651
0
                {
2652
0
                    MV bestmv, mvpSel[3];
2653
0
                    int mvpIdxSel[3];
2654
0
                    satdCost = m_me.COST_MAX;
2655
0
                    mvpSel[0] = mvp;
2656
0
                    mvpIdxSel[0] = mvpIdx;
2657
0
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2658
0
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
2659
0
                    mvpIdxSel[1] = mvpIdx;
2660
0
                    if (m_param->mvRefine > 2)
2661
0
                    {
2662
0
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
2663
0
                        mvpIdxSel[2] = !mvpIdx;
2664
0
                    }
2665
0
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
2666
0
                    {
2667
0
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
2668
0
                            continue;
2669
0
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
2670
0
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices, m_vertRestriction,
2671
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2672
0
                        if (satdCost > bcost)
2673
0
                        {
2674
0
                            satdCost = bcost;
2675
0
                            outmv = bestmv;
2676
0
                            mvp = mvpSel[cand];
2677
0
                            mvpIdx = mvpIdxSel[cand];
2678
0
                        }
2679
0
                    }
2680
0
                    mvpIn = mvp;
2681
0
                }
2682
0
                else
2683
0
                {
2684
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2685
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2686
0
                }
2687
2688
                /* Get total cost of partition, but only include MV bit cost once */
2689
0
                bits += m_me.bitcost(outmv);
2690
0
                uint32_t mvCost = m_me.mvcost(outmv);
2691
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2692
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
2693
0
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
2694
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2695
0
                else
2696
0
                {
2697
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
2698
                      the actual mvp is bestME from pass 1 for that mvpIdx */
2699
0
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
2700
0
                    if (diffBits < 0)
2701
0
                    {
2702
0
                        mvpIdx = !mvpIdx;
2703
0
                        uint32_t origOutBits = bits;
2704
0
                        bits = origOutBits + diffBits;
2705
0
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
2706
0
                    }
2707
0
                    mvp = amvp[mvpIdx];
2708
0
                }
2709
2710
0
                if (cost < bestME[list].cost)
2711
0
                {
2712
0
                    bestME[list].mv = outmv;
2713
0
                    bestME[list].mvp = mvp;
2714
0
                    bestME[list].mvpIdx = mvpIdx;
2715
0
                    bestME[list].cost = cost;
2716
0
                    bestME[list].bits = bits;
2717
0
                    bestME[list].mvCost  = mvCost;
2718
0
                    bestME[list].ref = ref;
2719
0
                }
2720
0
                bDoUnidir = false;
2721
0
            }            
2722
0
        }
2723
0
        else if (m_param->bDistributeMotionEstimation)
2724
0
        {
2725
0
            PME pme(*this, interMode, cuGeom, pu, puIdx);
2726
0
            pme.m_jobTotal = 0;
2727
0
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
2728
2729
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2730
0
            for (int list = 0; list < numPredDir; list++)
2731
0
            {
2732
0
                int idx = 0;
2733
0
                int numIdx = numRefIdx[list];
2734
#if ENABLE_SCC_EXT
2735
                if (!list && m_ibcEnabled)
2736
                    numIdx--;
2737
#endif
2738
0
                for (int ref = 0; ref < numIdx; ref++)
2739
0
                {
2740
0
                    if (!(refMask & (1 << ref)))
2741
0
                        continue;
2742
2743
0
                    pme.m_jobs.ref[list][idx++]  = ref;
2744
0
                    pme.m_jobTotal++;
2745
0
                }
2746
0
                pme.m_jobs.refCnt[list] = idx;
2747
2748
                /* the second list ref bits start at bit 16 */
2749
0
                refMask >>= 16;
2750
0
            }
2751
2752
0
            if (pme.m_jobTotal > 2)
2753
0
            {
2754
0
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
2755
2756
0
                processPME(pme, *this);
2757
2758
0
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
2759
0
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
2760
2761
0
                bDoUnidir = false;
2762
2763
0
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
2764
0
                pme.waitForExit();
2765
0
            }
2766
2767
            /* if no peer threads were bonded, fall back to doing unidirectional
2768
             * searches ourselves without overhead of singleMotionEstimation() */
2769
0
        }
2770
0
        if (bDoUnidir && !m_param->bThreadedME)
2771
0
        {
2772
0
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
2773
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2774
2775
0
            for (int list = 0; list < numPredDir; list++)
2776
0
            {
2777
0
                int numIdx = numRefIdx[list];
2778
#if ENABLE_SCC_EXT
2779
                if (!list && m_ibcEnabled)
2780
                    numIdx--;
2781
#endif
2782
0
                for (int ref = 0; ref < numIdx; ref++)
2783
0
                {
2784
0
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
2785
2786
0
                    if (!(refMask & (1 << ref)))
2787
0
                    {
2788
0
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
2789
0
                        continue;
2790
0
                    }
2791
2792
0
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2793
0
                    bits += getTUBits(ref, numIdx);
2794
2795
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2796
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2797
#else
2798
0
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2799
0
#endif
2800
2801
0
                    const MV* amvp = interMode.amvpCand[list][ref];
2802
0
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2803
0
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2804
0
                    bool bLowresMVP = false;
2805
2806
0
                    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging when lowresMV is not available */
2807
0
                    {
2808
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
2809
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2810
0
                        if (lmv.notZero() && !layer)
2811
0
                            mvc[numMvc++] = lmv;
2812
0
                        if (m_param->bEnableHME)
2813
0
                            mvp_lowres = lmv;
2814
0
                    }
2815
0
                    if (m_param->searchMethod == X265_SEA)
2816
0
                    {
2817
0
                        int puX = puIdx & 1;
2818
0
                        int puY = puIdx >> 1;
2819
0
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2820
0
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2821
0
                    }
2822
0
                    m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2823
0
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2824
0
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2825
0
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2826
2827
0
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2828
0
                    {
2829
0
                        MV outmv_lowres;
2830
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2831
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2832
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2833
0
                        if (lowresMvCost < satdCost)
2834
0
                        {
2835
0
                            outmv = outmv_lowres;
2836
0
                            satdCost = lowresMvCost;
2837
0
                            bLowresMVP = true;
2838
0
                        }
2839
0
                    }
2840
2841
                    /* Get total cost of partition, but only include MV bit cost once */
2842
0
                    bits += m_me.bitcost(outmv);
2843
0
                    uint32_t mvCost = m_me.mvcost(outmv);
2844
0
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2845
                    /* Update LowresMVP to best AMVP cand*/
2846
0
                    if (bLowresMVP)
2847
0
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2848
2849
                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2850
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2851
2852
#if ENABLE_SCC_EXT
2853
                    if (m_param->bEnableSCC && (list <= 1 && ref <= 1 && (cu.m_partSize[0] == SIZE_2NxN || cu.m_partSize[0] == SIZE_Nx2N) && (1 << cu.m_log2CUSize[0]) <= 16))
2854
                    {
2855
                        iMVCandList[4 * list + 2 * ref + puIdx] = outmv;
2856
                    }
2857
#endif
2858
2859
0
                    if (cost < bestME[list].cost)
2860
0
                    {
2861
0
                        bestME[list].mv      = outmv;
2862
0
                        bestME[list].mvp     = mvp;
2863
0
                        bestME[list].mvpIdx  = mvpIdx;
2864
0
                        bestME[list].ref     = ref;
2865
0
                        bestME[list].cost    = cost;
2866
0
                        bestME[list].bits    = bits;
2867
0
                        bestME[list].mvCost  = mvCost;
2868
0
                    }
2869
0
                }
2870
                /* the second list ref bits start at bit 16 */
2871
0
                refMask >>= 16;
2872
0
            }
2873
0
        }
2874
2875
        /* Bi-directional prediction */
2876
0
        MotionData bidir[2];
2877
0
        uint32_t bidirCost = MAX_UINT;
2878
0
        int bidirBits = 0;
2879
2880
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
2881
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
2882
0
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !m_param->bThreadedME)
2883
0
        {
2884
0
            bidir[0] = bestME[0];
2885
0
            bidir[1] = bestME[1];
2886
2887
0
            int satdCost;
2888
2889
0
            if (m_me.bChromaSATD)
2890
0
            {
2891
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
2892
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2893
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
2894
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2895
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
2896
2897
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2898
0
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2899
0
            }
2900
0
            else
2901
0
            {
2902
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
2903
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
2904
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2905
2906
                /* Generate reference subpels */
2907
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
2908
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
2909
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
2910
0
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
2911
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2912
0
            }
2913
2914
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2915
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2916
2917
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2918
0
            if (bTryZero)
2919
0
            {
2920
                /* Do not try zero MV if unidir motion predictors are beyond
2921
                 * valid search area */
2922
0
                MV mvmin, mvmax;
2923
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2924
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2925
0
                mvmax.y += 2; // there is some pad for subpel refine
2926
0
                mvmin <<= 2;
2927
0
                mvmax <<= 2;
2928
2929
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2930
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2931
0
            }
2932
0
            if (bTryZero)
2933
0
            {
2934
                /* coincident blocks of the two reference pictures */
2935
0
                if (m_me.bChromaSATD)
2936
0
                {
2937
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
2938
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2939
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
2940
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2941
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
2942
2943
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2944
0
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2945
0
                }
2946
0
                else
2947
0
                {
2948
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2949
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2950
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
2951
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2952
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2953
0
                }
2954
0
                MV mvp0 = bestME[0].mvp;
2955
0
                int mvpIdx0 = bestME[0].mvpIdx;
2956
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2957
2958
0
                MV mvp1 = bestME[1].mvp;
2959
0
                int mvpIdx1 = bestME[1].mvpIdx;
2960
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
2961
2962
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2963
2964
                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2965
0
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
2966
0
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
2967
2968
0
                if (cost < bidirCost)
2969
0
                {
2970
0
                    bidir[0].mv = mvzero;
2971
0
                    bidir[1].mv = mvzero;
2972
0
                    bidir[0].mvp = mvp0;
2973
0
                    bidir[1].mvp = mvp1;
2974
0
                    bidir[0].mvpIdx = mvpIdx0;
2975
0
                    bidir[1].mvpIdx = mvpIdx1;
2976
0
                    bidirCost = cost;
2977
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2978
0
                }
2979
0
            }
2980
0
        }
2981
2982
0
        uint32_t bestCost = MAX_INT;
2983
0
        bool isMerge = false;
2984
0
        bool isBidir = false;
2985
0
        bool uniL0 = false;
2986
0
        bool uniL1 = false;
2987
2988
0
        if (m_param->bThreadedME)
2989
0
        {
2990
0
            int cuSize = 1 << cu.m_log2CUSize[0];
2991
2992
0
            int lookupWidth = pu.width;
2993
0
            int lookupHeight = pu.height;
2994
2995
0
            bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU;
2996
2997
0
            if (isAmp)
2998
0
            {
2999
0
                if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == SIZE_2NxnD)
3000
0
                    lookupHeight = (puIdx) ? (pu.width - pu.height) : pu.height;
3001
0
                else
3002
0
                    lookupWidth = (puIdx) ? (pu.height - pu.width) : pu.width;
3003
0
            }
3004
3005
0
            int startIdx = g_puStartIdx[lookupWidth + lookupHeight][static_cast<int>(cu.m_partSize[0])];
3006
3007
0
            int alignWidth = isAmp ? cuSize : pu.width;
3008
0
            int alignHeight = isAmp ? cuSize : pu.height;
3009
3010
0
            int numPUX = m_param->maxCUSize / alignWidth;
3011
0
            int numPUY = m_param->maxCUSize / alignHeight;
3012
3013
0
            int puOffset = isAmp ? (puIdx * numPUX * numPUY) : (cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx);
3014
 
3015
0
            int relX = (cu.m_cuPelX / alignWidth) % numPUX;
3016
0
            int relY = (cu.m_cuPelY / alignHeight) % numPUY;
3017
3018
0
            int index = startIdx + (relY * numPUX + relX) + puOffset;
3019
3020
0
            int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
3021
0
            int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
3022
3023
0
            int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
3024
3025
0
            MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index];
3026
3027
0
            bestME[0].ref = meData.ref[0];
3028
0
            bestME[1].ref = meData.ref[1];
3029
3030
0
            isBidir = (bestME[0].ref >= 0 && bestME[1].ref >= 0);
3031
0
            uniL0 = (bestME[0].ref >= 0 && bestME[1].ref == REF_NOT_VALID);
3032
0
            uniL1 = (bestME[1].ref >= 0 && bestME[0].ref == REF_NOT_VALID);
3033
3034
0
            if(isBidir)
3035
0
            {
3036
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3037
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3038
3039
0
                bidir[0].mv = meData.mv[0];
3040
0
                bidir[1].mv = meData.mv[1];
3041
0
                bidir[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3042
0
                bidir[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3043
0
                bidir[0].mvCost = meData.mvCost[0];
3044
0
                bidir[1].mvCost = meData.mvCost[1];
3045
0
                bidirCost = meData.cost;
3046
0
                bidirBits = meData.bits;
3047
3048
0
                bestCost = bidirCost;
3049
0
            }
3050
0
            else if (uniL0)
3051
0
            {
3052
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3053
3054
0
                bestME[0].mv = meData.mv[0];
3055
0
                bestME[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3056
0
                bestME[0].mvCost = meData.mvCost[0];
3057
0
                bestME[0].cost = meData.cost;
3058
0
                bestME[0].bits = meData.bits;
3059
3060
0
                bestCost = bestME[0].cost;
3061
0
            }
3062
0
            else if (uniL1)
3063
0
            {
3064
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3065
3066
0
                bestME[1].mv = meData.mv[1];
3067
0
                bestME[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3068
0
                bestME[1].mvCost = meData.mvCost[1];
3069
0
                bestME[1].cost = meData.cost;
3070
0
                bestME[1].bits = meData.bits;
3071
3072
0
                bestCost = bestME[1].cost;
3073
0
            }
3074
0
            else
3075
0
                x265_log(NULL, X265_LOG_ERROR, "Invalid ME mode");
3076
3077
0
            if (mrgCost < bestCost)
3078
0
                isMerge = true;
3079
0
        }
3080
3081
        /* select best option and store into CU */
3082
0
        if ((mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) || isMerge)
3083
0
        {
3084
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
3085
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
3086
0
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
3087
0
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
3088
0
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
3089
0
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
3090
0
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
3091
3092
0
            totalmebits += merge.bits;
3093
0
        }
3094
0
        else if ((bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) || isBidir)
3095
0
        {
3096
0
            lastMode = 2;
3097
3098
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3099
0
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
3100
0
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
3101
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3102
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
3103
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
3104
3105
0
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
3106
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3107
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
3108
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
3109
3110
0
            totalmebits += bidirBits;
3111
0
        }
3112
0
        else if ((bestME[0].cost <= bestME[1].cost) || uniL0)
3113
0
        {
3114
0
            lastMode = 0;
3115
3116
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3117
0
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3118
0
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
3119
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3120
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
3121
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
3122
3123
0
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3124
0
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
3125
3126
0
            totalmebits += bestME[0].bits;
3127
0
        }
3128
0
        else
3129
0
        {
3130
0
            lastMode = 1;
3131
3132
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3133
0
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
3134
0
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
3135
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3136
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
3137
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
3138
3139
0
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3140
0
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
3141
3142
0
            totalmebits += bestME[1].bits;
3143
0
        }
3144
3145
0
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
3146
0
    }
3147
0
    interMode.sa8dBits += totalmebits;
3148
0
}
3149
3150
#if ENABLE_SCC_EXT
3151
uint32_t Search::getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height)
3152
{
3153
    uint32_t dist = 0;
3154
3155
    for (int i = 0; i < height; i++)
3156
    {
3157
        for (int j = 0; j < width; j++)
3158
        {
3159
            dist += abs(ref[j] - curr[j]);
3160
        }
3161
        ref += refStride;
3162
        curr += currStride;
3163
    }
3164
    return dist;
3165
}
3166
3167
int Search::intraBCSearchMVChromaRefine(Mode& intraBCMode,
3168
    const CUGeom& cuGeom,
3169
    int         roiWidth,
3170
    int         roiHeight,
3171
    int         cuPelX,
3172
    int         cuPelY,
3173
    uint32_t* sadBestCand,
3174
    MV* MVCand,
3175
    uint32_t    partOffset,
3176
    int         puIdx
3177
)
3178
{
3179
    int bestCandIdx = 0;
3180
    uint32_t  sadBest = UINT_MAX;
3181
    uint32_t  tempSad;
3182
3183
    pixel* ref;
3184
    const pixel* picOrg;
3185
    int refStride, orgStride;
3186
    int width, height;
3187
3188
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3189
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3190
3191
    CUData& cu = intraBCMode.cu;
3192
    Yuv& tmpPredYuv = intraBCMode.predYuv;
3193
    PredictionUnit pu(cu, cuGeom, puIdx);
3194
3195
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3196
    {
3197
        if ((!MVCand[cand].x) && (!MVCand[cand].y))
3198
        {
3199
            continue;
3200
        }
3201
3202
        if (((int)(cuPelY + MVCand[cand].y + roiHeight) >= picHeight) || ((cuPelY + MVCand[cand].y) < 0))
3203
        {
3204
            continue;
3205
        }
3206
3207
        if (((int)(cuPelX + MVCand[cand].x + roiWidth) >= picWidth) || ((cuPelX + MVCand[cand].x) < 0))
3208
        {
3209
            continue;
3210
        }
3211
3212
        tempSad = sadBestCand[cand];
3213
        int bitDepths = m_param->sourceBitDepth;
3214
        MV mvQuaterPixl = MVCand[cand];
3215
        mvQuaterPixl <<= 2;
3216
        cu.setPUMv(0, mvQuaterPixl, pu.puAbsPartIdx, puIdx);
3217
        cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
3218
        cu.setPUMv(1, MV(), pu.puAbsPartIdx, puIdx);
3219
        cu.setPURefIdx(1, -1, pu.puAbsPartIdx, puIdx);
3220
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3221
3222
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
3223
3224
        for (uint32_t ch = TEXT_CHROMA_U; ch < MAX_NUM_COMPONENT; ch++)
3225
        {
3226
            ref = m_slice->m_refFrameList[0][m_slice->m_numRefIdx[0] - 1]->m_reconPic[1]->getChromaAddr(ch, cu.m_cuAddr, cu.m_absIdxInCTU + partOffset);
3227
3228
            picOrg = intraBCMode.fencYuv->getChromaAddr(ch, partOffset);
3229
            orgStride = intraBCMode.fencYuv->m_csize;
3230
3231
            refStride = m_frame->m_reconPic[1]->m_strideC;
3232
3233
            width = roiWidth >> m_hChromaShift;
3234
            height = roiHeight >> m_vChromaShift;
3235
3236
            ref = tmpPredYuv.getChromaAddr(ch, partOffset);
3237
            refStride = tmpPredYuv.m_csize;
3238
3239
            for (int row = 0; row < height; row++)
3240
            {
3241
                for (int col = 0; col < width; col++)
3242
                {
3243
                    tempSad += ((abs(ref[col] - picOrg[col])) >> (bitDepths - 8));
3244
                }
3245
                ref += refStride;
3246
                picOrg += orgStride;
3247
            }
3248
        }
3249
3250
        if (tempSad < sadBest)
3251
        {
3252
            sadBest = tempSad;
3253
            bestCandIdx = cand;
3254
        }
3255
    }
3256
3257
    return bestCandIdx;
3258
}
3259
3260
void Search::updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc)
3261
{
3262
    if (roiWidth + roiHeight > 8)
3263
    {
3264
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvCand, CHROMA_REFINEMENT_CANDIDATES, false);
3265
3266
        if (roiWidth + roiHeight == 32)
3267
        {
3268
            ibc.m_numBV16s = ibc.m_numBVs;
3269
        }
3270
    }
3271
}
3272
3273
void Search::intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* MVCand)
3274
{
3275
    int j = CHROMA_REFINEMENT_CANDIDATES - 1;
3276
3277
    if (sad < sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3278
    {
3279
        for (int t = CHROMA_REFINEMENT_CANDIDATES - 1; t >= 0; t--)
3280
        {
3281
            if (sad < sadBestCand[t])
3282
            {
3283
                j = t;
3284
            }
3285
        }
3286
3287
        for (int k = CHROMA_REFINEMENT_CANDIDATES - 1; k > j; k--)
3288
        {
3289
            sadBestCand[k] = sadBestCand[k - 1];
3290
3291
            MVCand[k].set(MVCand[k - 1].x, MVCand[k - 1].y);
3292
        }
3293
        sadBestCand[j] = sad;
3294
        MVCand[j].set(x, y);
3295
    }
3296
}
3297
3298
uint32_t Search::mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel)
3299
{
3300
    for (uint32_t cand = 0; cand < sn && dn < SCM_S0067_NUM_CANDIDATES; cand++)
3301
    {
3302
        bool found = false;
3303
        MV TempMv = src[cand];
3304
        if (!isSrcQuarPel)
3305
        {
3306
            TempMv <<= 2;
3307
        }
3308
        for (uint32_t j = 0; j < dn; j++)
3309
        {
3310
            if (TempMv == dst[j])
3311
            {
3312
                found = true;
3313
                break;
3314
            }
3315
        }
3316
3317
        if (!found)
3318
        {
3319
            dst[dn] = TempMv;
3320
            dn++;
3321
        }
3322
    }
3323
    return dn;
3324
}
3325
3326
void Search::restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand)
3327
{
3328
    {
3329
        for (uint32_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
3330
        {
3331
            if (interDirNeighbours[mergeCand] == 3)
3332
            {
3333
                bool b8x8BiPredRestricted = cu->is8x8BipredRestriction(
3334
                    mvFieldNeighbours[mergeCand][0].mv,
3335
                    mvFieldNeighbours[mergeCand][1].mv,
3336
                    mvFieldNeighbours[mergeCand][0].refIdx,
3337
                    mvFieldNeighbours[mergeCand][1].refIdx);
3338
3339
                int width = 0;
3340
                int height = 0;
3341
                uint32_t partAddr;
3342
3343
                cu->getPartIndexAndSize(puIdx, partAddr, width, height);
3344
                if (b8x8BiPredRestricted)
3345
                {
3346
                    if (width <= 8 && height <= 8)
3347
                    {
3348
                        interDirNeighbours[mergeCand] = 1;
3349
                        mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
3350
                    }
3351
                }
3352
                else if (cu->isBipredRestriction())
3353
                {
3354
                    interDirNeighbours[mergeCand] = 1;
3355
                    mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
3356
                }
3357
            }
3358
        }
3359
    }
3360
}
3361
3362
bool Search::isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* cu,
3363
    int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize)
3364
{
3365
    static const int s_floorLog2[65] =
3366
    {
3367
      -1, 0, 1, 1, 2, 2, 2, 2, 3, 3,
3368
       3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
3369
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
3370
       4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
3371
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3372
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3373
       5, 5, 5, 5, 6
3374
    };
3375
3376
    int ctuSizeLog2 = s_floorLog2[ctuSize];
3377
    int interpolationSamplesX = (cu->m_chromaFormat == X265_CSP_I422 || cu->m_chromaFormat == X265_CSP_I420) ? ((xBv & 0x1) << 1) : 0;
3378
    int interpolationSamplesY = (cu->m_chromaFormat == X265_CSP_I420) ? ((yBv & 0x1) << 1) : 0;
3379
    int refRightX = xPos + xBv + width - 1 + interpolationSamplesX;
3380
    int refBottomY = yPos + yBv + height - 1 + interpolationSamplesY;
3381
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3382
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3383
3384
    if ((xPos + xBv - interpolationSamplesX) < 0)
3385
        return false;
3386
    if (refRightX >= picWidth)
3387
        return false;
3388
    if ((yPos + yBv - interpolationSamplesY) < 0)
3389
        return false;
3390
    if (refBottomY >= picHeight)
3391
        return false;
3392
3393
    if ((xBv + width + interpolationSamplesX) > 0 && (yBv + height + interpolationSamplesY) > 0)
3394
        return false;
3395
3396
    if (refBottomY >> ctuSizeLog2 < yPos >> ctuSizeLog2)
3397
    {
3398
        int refCuX = refRightX / ctuSize;
3399
        int refCuY = refBottomY / ctuSize;
3400
        int cuPelX = xPos / ctuSize;
3401
        int cuPelY = yPos / ctuSize;
3402
3403
        if (((int)(refCuX - cuPelX) > (int)((cuPelY - refCuY))))
3404
            return false;
3405
        else
3406
            return true;
3407
    }
3408
3409
    if (refBottomY >> ctuSizeLog2 > yPos >> ctuSizeLog2)
3410
    {
3411
        return false;
3412
    }
3413
3414
    // in the same CTU line
3415
    if (refRightX >> ctuSizeLog2 < xPos >> ctuSizeLog2)
3416
        return true;
3417
    if (refRightX >> ctuSizeLog2 > xPos >> ctuSizeLog2)
3418
        return false;
3419
3420
    // same CTU
3421
    int mask = 1 << ctuSizeLog2;
3422
    mask -= 1;
3423
    int rasterCurr = ((((yPos & mask) - yStartInCU) >> 2) << (ctuSizeLog2 - 2)) + (((xPos & mask) - xStartInCU) >> 2);
3424
    int rasterRef = (((refBottomY & mask) >> 2) << (ctuSizeLog2 - 2)) + ((refRightX & mask) >> 2);
3425
3426
    if (g_rasterToZscan[rasterRef] >= g_rasterToZscan[rasterCurr])
3427
        return false;
3428
    return true;
3429
}
3430
3431
bool Search::isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset)
3432
{
3433
    const int  cuPelX = cu->m_cuPelX + g_zscanToPelX[partOffset];
3434
    const int  cuPelY = cu->m_cuPelY + g_zscanToPelY[partOffset];
3435
3436
    if (!isBlockVectorValid(cuPelX, cuPelY, roiWidth, roiHeight, cu, g_zscanToPelX[partOffset], g_zscanToPelY[partOffset], predX, predY, m_param->maxCUSize))
3437
    {
3438
        return false;
3439
    }
3440
    return true;
3441
}
3442
3443
void Search::intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
3444
    MV& mv, uint32_t& cost, int roiWidth, int roiHeight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
3445
{
3446
    const int   srchRngHorLeft = searchRangeLT->x;
3447
    const int   srchRngHorRight = searchRangeRB->x;
3448
    const int   srchRngVerTop = searchRangeLT->y;
3449
    const int   srchRngVerBottom = searchRangeRB->y;
3450
3451
    CUData& cu = intraBCMode.cu;
3452
    const uint32_t  lcuWidth = m_param->maxCUSize;
3453
    const uint32_t  lcuHeight = m_param->maxCUSize;
3454
    const int       puPelOffsetX = g_zscanToPelX[partAddr];
3455
    const int       puPelOffsetY = g_zscanToPelY[partAddr];
3456
    const int       cuPelX = cu.m_cuPelX + puPelOffsetX;  // Point to the location of PU
3457
    const int       cuPelY = cu.m_cuPelY + puPelOffsetY;
3458
3459
    uint32_t  sad = 0;
3460
    uint32_t  sadBest = UINT_MAX;
3461
    int         bestX = 0;
3462
    int         bestY = 0;
3463
    pixel* refSrch;
3464
3465
    int         bestCandIdx = 0;
3466
    uint32_t    partOffset = 0;
3467
    MV          MVCand[CHROMA_REFINEMENT_CANDIDATES];
3468
    uint32_t    sadBestCand[CHROMA_REFINEMENT_CANDIDATES];
3469
3470
    partOffset = partAddr;
3471
    PredictionUnit pu(cu, cuGeom, puIdx);
3472
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3473
    {
3474
        sadBestCand[cand] = UINT_MAX;
3475
        MVCand[cand].set(0, 0);
3476
    }
3477
3478
    const int         relCUPelX = cuPelX % lcuWidth;
3479
    const int         relCUPelY = cuPelY % lcuHeight;
3480
    const int chromaROIWidthInPixels = roiWidth;
3481
    const int chromaROIHeightInPixels = roiHeight;
3482
    bool fastsearch = (m_param->bEnableSCC == 1) ? true : false;
3483
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3484
3485
    if (fastsearch)
3486
    {
3487
        uint32_t tempSadBest = 0;
3488
        int srLeft = srchRngHorLeft, srRight = srchRngHorRight, srTop = srchRngVerTop, srBottom = srchRngVerBottom;
3489
        const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3490
        const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3491
3492
        if (isFullFrameSearchrangeEnabled)//full frame search
3493
        {
3494
            srLeft = -1 * cuPelX;
3495
            srTop = -1 * cuPelY;
3496
3497
            srRight = picWidth - cuPelX - roiWidth;
3498
            srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3499
3500
            if (cuPelX + srRight + roiWidth > (int)picWidth)
3501
            {
3502
                srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3503
            }
3504
            if (cuPelY + srBottom + roiHeight > (int)picHeight)
3505
            {
3506
                srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3507
            }
3508
        }
3509
3510
        if (roiWidth > 8 || roiHeight > 8)
3511
            ibc.m_numBVs = 0;
3512
        else if (roiWidth + roiHeight == 16)
3513
            ibc.m_numBVs = ibc.m_numBV16s;
3514
        if (testOnlyPred)
3515
            ibc.m_numBVs = 0;
3516
3517
        MV  mvPredEncOnly[16];
3518
        int nbPreds = 0;
3519
        cu.getIntraBCMVPsEncOnly(partAddr, mvPredEncOnly, nbPreds, puIdx);
3520
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvPredEncOnly, nbPreds, true);
3521
3522
        for (int cand = 0; cand < ibc.m_numBVs; cand++)
3523
        {
3524
            int xPred = ibc.m_BVs[cand].x >> 2;
3525
            int yPred = ibc.m_BVs[cand].y >> 2;
3526
            if (!(xPred == 0 && yPred == 0) && !((yPred < srTop) || (yPred > srBottom)) && !((xPred < srLeft) || (xPred > srRight)))
3527
            {
3528
                int tempY = yPred + relCUPelY + roiHeight - 1;
3529
                int tempX = xPred + relCUPelX + roiWidth - 1;
3530
                bool validCand = isValidIntraBCSearchArea(&cu, xPred, yPred, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset);
3531
3532
                if ((tempX >= (int)lcuWidth) && (tempY >= 0) && isFullFrameSearchrangeEnabled)
3533
                    validCand = false;
3534
3535
                if ((tempX >= 0) && (tempY >= 0))
3536
                {
3537
                    int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3538
                    uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3539
                    if (tempZscanIdx >= cu.m_absIdxInCTU)
3540
                    {
3541
                        validCand = false;
3542
                    }
3543
                }
3544
3545
                if (validCand)
3546
                {
3547
                    sad = m_me.mvcost(ibc.m_BVs[cand]);
3548
3549
                    refSrch = refY + yPred * refStride + xPred;
3550
3551
                    sad += m_me.bufSAD(refSrch, refStride);
3552
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3553
                    {
3554
                        continue;
3555
                    }
3556
3557
                    intraBCSearchMVCandUpdate(sad, xPred, yPred, sadBestCand, MVCand);
3558
                }
3559
            }
3560
        }
3561
        bestX = MVCand[0].x;
3562
        bestY = MVCand[0].y;
3563
        mv.set(bestX, bestY);
3564
        sadBest = sadBestCand[0];
3565
3566
        if (testOnlyPred)
3567
        {
3568
            cost = sadBest;
3569
            return;
3570
        }
3571
3572
        const int boundY = (0 - roiHeight - puPelOffsetY);
3573
        int lowY = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3574
            ? -cuPelY : X265_MAX(srchRngVerTop, 0 - cuPelY);
3575
        for (int y = boundY; y >= lowY; y--)
3576
        {
3577
            if (!isValidIntraBCSearchArea(&cu, 0, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3578
            {
3579
                continue;
3580
            }
3581
3582
            sad = m_me.mvcost(MV(0, y));
3583
3584
            refSrch = refY + y * refStride;
3585
3586
            sad += m_me.bufSAD(refSrch, refStride);
3587
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3588
            {
3589
                continue;
3590
            }
3591
3592
            intraBCSearchMVCandUpdate(sad, 0, y, sadBestCand, MVCand);
3593
            tempSadBest = sadBestCand[0];
3594
            if (sadBestCand[0] <= 3)
3595
            {
3596
                bestX = MVCand[0].x;
3597
                bestY = MVCand[0].y;
3598
                sadBest = sadBestCand[0];
3599
                mv.set(bestX, bestY);
3600
                cost = sadBest;
3601
3602
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3603
                return;
3604
            }
3605
        }
3606
3607
        const int boundX = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3608
            ? -cuPelX : X265_MAX(srchRngHorLeft, -cuPelX);
3609
        for (int x = 0 - roiWidth - puPelOffsetX; x >= boundX; --x)
3610
        {
3611
            if (!isValidIntraBCSearchArea(&cu, x, 0, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3612
            {
3613
                continue;
3614
            }
3615
3616
            sad = m_me.mvcost(MV(x, 0));
3617
3618
            refSrch = refY + x;
3619
            sad += m_me.bufSAD(refSrch, refStride);
3620
3621
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3622
            {
3623
                continue;
3624
            }
3625
3626
            intraBCSearchMVCandUpdate(sad, x, 0, sadBestCand, MVCand);
3627
            tempSadBest = sadBestCand[0];
3628
            if (sadBestCand[0] <= 3)
3629
            {
3630
                bestX = MVCand[0].x;
3631
                bestY = MVCand[0].y;
3632
                sadBest = sadBestCand[0];
3633
                mv.set(bestX, bestY);
3634
                cost = sadBest;
3635
3636
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3637
                return;
3638
            }
3639
        }
3640
3641
        bestX = MVCand[0].x;
3642
        bestY = MVCand[0].y;
3643
        sadBest = sadBestCand[0];
3644
3645
        if ((!bestX && !bestY) || (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 32))
3646
        {
3647
            //chroma refine
3648
            bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3649
            bestX = MVCand[bestCandIdx].x;
3650
            bestY = MVCand[bestCandIdx].y;
3651
            sadBest = sadBestCand[bestCandIdx];
3652
            mv.set(bestX, bestY);
3653
            cost = sadBest;
3654
3655
            updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3656
            return;
3657
        }
3658
3659
        if (cuGeom.depth > 2 && !bUse1DSearchFor8x8)
3660
        {
3661
            for (int y = X265_MAX(srchRngVerTop, -cuPelY); y <= srchRngVerBottom; y += 2)
3662
            {
3663
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3664
                {
3665
                    continue;
3666
                }
3667
3668
                int tempY = y + relCUPelY + roiHeight - 1;
3669
3670
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x++)
3671
                {
3672
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3673
                    {
3674
                        continue;
3675
                    }
3676
3677
                    int tempX = x + relCUPelX + roiWidth - 1;
3678
3679
                    if ((tempX >= 0) && (tempY >= 0))
3680
                    {
3681
                        int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3682
                        uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3683
                        if (iTempZscanIdx >= cu.m_absIdxInCTU)
3684
                        {
3685
                            continue;
3686
                        }
3687
                    }
3688
3689
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3690
                    {
3691
                        continue;
3692
                    }
3693
3694
                    sad = m_me.mvcost(MV(x, y));
3695
3696
                    refSrch = refY + y * refStride + x;
3697
                    sad += m_me.bufSAD(refSrch, refStride);
3698
3699
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3700
                }
3701
            }
3702
3703
            bestX = MVCand[0].x;
3704
            bestY = MVCand[0].y;
3705
            sadBest = sadBestCand[0];
3706
            if (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 16)
3707
            {
3708
                //chroma refine
3709
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3710
                bestX = MVCand[bestCandIdx].x;
3711
                bestY = MVCand[bestCandIdx].y;
3712
                sadBest = sadBestCand[bestCandIdx];
3713
                mv.set(bestX, bestY);
3714
                cost = sadBest;
3715
3716
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3717
                return;
3718
            }
3719
3720
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3721
            {
3722
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3723
                {
3724
                    continue;
3725
                }
3726
3727
                int tempY = y + relCUPelY + roiHeight - 1;
3728
3729
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x += 2)
3730
                {
3731
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3732
                    {
3733
                        continue;
3734
                    }
3735
3736
                    int tempX = x + relCUPelX + roiWidth - 1;
3737
3738
                    if ((tempX >= 0) && (tempY >= 0))
3739
                    {
3740
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3741
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3742
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3743
                        {
3744
                            continue;
3745
                        }
3746
                    }
3747
3748
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3749
                    {
3750
                        continue;
3751
                    }
3752
3753
                    sad = m_me.mvcost(MV(x, y));
3754
3755
                    refSrch = refY + y * refStride + x;
3756
                    sad += m_me.bufSAD(refSrch, refStride);
3757
3758
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3759
                    {
3760
                        continue;
3761
                    }
3762
3763
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3764
                    if (sadBestCand[0] <= 5)
3765
                    {
3766
                        //chroma refine & return
3767
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3768
                        bestX = MVCand[bestCandIdx].x;
3769
                        bestY = MVCand[bestCandIdx].y;
3770
                        sadBest = sadBestCand[bestCandIdx];
3771
                        mv.set(bestX, bestY);
3772
                        cost = sadBest;
3773
3774
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3775
                        return;
3776
                    }
3777
                }
3778
            }
3779
3780
            bestX = MVCand[0].x;
3781
            bestY = MVCand[0].y;
3782
            sadBest = sadBestCand[0];
3783
3784
            if ((sadBest >= tempSadBest) || ((sadBest - m_me.mvcost(MV(bestX, bestY))) <= 32))
3785
            {
3786
                //chroma refine
3787
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3788
                bestX = MVCand[bestCandIdx].x;
3789
                bestY = MVCand[bestCandIdx].y;
3790
                sadBest = sadBestCand[bestCandIdx];
3791
                mv.set(bestX, bestY);
3792
                cost = sadBest;
3793
3794
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3795
                return;
3796
            }
3797
3798
            tempSadBest = sadBestCand[0];
3799
3800
3801
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3802
            {
3803
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3804
                {
3805
                    continue;
3806
                }
3807
3808
                int tempY = y + relCUPelY + roiHeight - 1;
3809
3810
                for (int x = (X265_MAX(srchRngHorLeft, -cuPelX) + 1); x <= srchRngHorRight; x += 2)
3811
                {
3812
3813
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3814
                    {
3815
                        continue;
3816
                    }
3817
3818
                    int tempX = x + relCUPelX + roiWidth - 1;
3819
3820
                    if ((tempX >= 0) && (tempY >= 0))
3821
                    {
3822
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3823
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3824
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3825
                        {
3826
                            continue;
3827
                        }
3828
                    }
3829
3830
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3831
                    {
3832
                        continue;
3833
                    }
3834
3835
                    sad = m_me.mvcost(MV(x, y));
3836
3837
                    refSrch = refY + y * refStride + x;
3838
                    sad += m_me.bufSAD(refSrch, refStride);
3839
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3840
                    {
3841
                        continue;
3842
                    }
3843
3844
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3845
                    if (sadBestCand[0] <= 5)
3846
                    {
3847
                        //chroma refine & return
3848
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3849
                        bestX = MVCand[bestCandIdx].x;
3850
                        bestY = MVCand[bestCandIdx].y;
3851
                        sadBest = sadBestCand[bestCandIdx];
3852
                        mv.set(bestX, bestY);
3853
                        cost = sadBest;
3854
3855
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3856
                        return;
3857
                    }
3858
                }
3859
            }
3860
        }
3861
    }
3862
    else //full search
3863
    {
3864
        refY += (srchRngVerBottom * refStride);
3865
        int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3866
        int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3867
3868
        for (int y = srchRngVerBottom; y >= srchRngVerTop; y--)
3869
        {
3870
            if (((int)(cuPelY + y) < 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3871
            {
3872
                refY -= refStride;
3873
                continue;
3874
            }
3875
3876
            for (int x = srchRngHorLeft; x <= srchRngHorRight; x++)
3877
            {
3878
3879
                if (((int)(cuPelX + x) < 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3880
                {
3881
                    continue;
3882
                }
3883
3884
                int tempX = x + relCUPelX + roiWidth - 1;
3885
                int tempY = y + relCUPelY + roiHeight - 1;
3886
                if ((tempX >= 0) && (tempY >= 0))
3887
                {
3888
                    int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3889
                    uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3890
                    if (iTempZscanIdx >= cu.m_absIdxInCTU)
3891
                    {
3892
                        continue;
3893
                    }
3894
                }
3895
3896
                if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3897
                {
3898
                    continue;
3899
                }
3900
3901
                refSrch = refY + x;
3902
3903
                sad = m_me.bufSAD(refSrch, refStride);
3904
                sad += m_me.mvcost(MV(x, y));
3905
                if (sad < sadBest)
3906
                {
3907
                    sadBest = sad;
3908
                    bestX = x;
3909
                    bestY = y;
3910
                }
3911
                intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3912
            }
3913
3914
            refY -= refStride;
3915
        }
3916
    }
3917
3918
    bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3919
    bestX = MVCand[bestCandIdx].x;
3920
    bestY = MVCand[bestCandIdx].y;
3921
    sadBest = sadBestCand[bestCandIdx];
3922
    mv.set(bestX, bestY);
3923
    cost = sadBest;
3924
3925
    updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3926
3927
}
3928
3929
void Search::setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB)
3930
{
3931
    MV mvPred = pred;
3932
    CUData& cu = intraBCMode.cu;
3933
    cu.clipMv(mvPred);
3934
    int srLeft, srRight, srTop, srBottom;
3935
    int puWidth, puHeight;
3936
    uint32_t partAddr;
3937
3938
    cu.getPartIndexAndSize(puIdx, partAddr, puWidth, puHeight);
3939
3940
    const uint32_t lcuWidth = m_param->maxCUSize;
3941
    const uint32_t lcuHeight = m_param->maxCUSize;
3942
    const uint32_t cuPelX = cu.m_cuPelX + g_zscanToPelX[partAddr];
3943
    const uint32_t cuPelY = cu.m_cuPelY + g_zscanToPelY[partAddr];
3944
3945
    const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3946
    const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3947
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3948
    if (1 << cu.m_log2CUSize[0] == 16 && cu.m_partSize[0] == SIZE_2Nx2N && isFullFrameSearchrangeEnabled)// full frame search
3949
    {
3950
        srLeft = -1 * cuPelX;
3951
        srTop = -1 * cuPelY;
3952
3953
        srRight = picWidth - cuPelX - roiWidth;
3954
        srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3955
    }
3956
    else
3957
    {
3958
        const uint32_t searchWidthInCTUs = 1 << cu.m_log2CUSize[0] == 8 ? 1 : (isFullFrameSearchrangeEnabled) ? -1 : 1;
3959
        uint32_t width = 0, maxWidth = searchWidthInCTUs * lcuWidth;
3960
        for (const CUData* pTestCU = cu.m_cuLeft;
3961
            width < maxWidth && pTestCU != NULL && pTestCU->m_slice != NULL;
3962
            pTestCU = pTestCU->m_cuLeft, width += lcuWidth)
3963
        {
3964
        }
3965
        int maxXsr = (cuPelX % lcuWidth) + X265_MIN(maxWidth, width);
3966
        int maxYsr = cuPelY % lcuHeight;
3967
3968
        if (cu.m_chromaFormat == X265_CSP_I420 || cu.m_chromaFormat == X265_CSP_I422) maxXsr &= ~0x4;
3969
        if (cu.m_chromaFormat == X265_CSP_I420)                                       maxYsr &= ~0x4;
3970
3971
        srLeft = -maxXsr;
3972
        srTop = -maxYsr;
3973
3974
        srRight = lcuWidth - cuPelX % lcuWidth - roiWidth;
3975
        srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3976
    }
3977
3978
    if (cuPelX + srRight + roiWidth > picWidth)
3979
    {
3980
        srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3981
    }
3982
    if (cuPelY + srBottom + roiHeight > picHeight)
3983
    {
3984
        srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3985
    }
3986
3987
    searchRangeLT.x = srLeft;
3988
    searchRangeLT.y = srTop;
3989
    searchRangeRB.x = srRight;
3990
    searchRangeRB.y = srBottom;
3991
3992
    cu.clipMv(searchRangeLT);
3993
    cu.clipMv(searchRangeRB);
3994
3995
}
3996
3997
void Search::intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
3998
{
3999
    uint32_t         partAddr;
4000
    int              roiWidth;
4001
    int              roiHeight;
4002
4003
    MV   searchRangeLT;
4004
    MV   searchRangeRB;
4005
    MV   mvPred = *pred;
4006
    const MV predictors = *pred;
4007
4008
    CUData& cu = intraBCMode.cu;
4009
    cu.getPartIndexAndSize(puIdx, partAddr, roiWidth, roiHeight);
4010
4011
    int ref = m_slice->m_numRefIdx[0] - 1;
4012
    pixel* refY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->getLumaAddr(cu.m_cuAddr, cu.m_absIdxInCTU + partAddr);
4013
    int  strideY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->m_stride;
4014
4015
    setIntraSearchRange(intraBCMode, mvPred, puIdx, roiWidth, roiHeight, searchRangeLT, searchRangeRB);
4016
4017
    m_me.setMVP(predictors);
4018
4019
    intraPatternSearch(intraBCMode, cuGeom, puIdx, partAddr, refY, strideY, &searchRangeLT, &searchRangeRB, mv, cost, roiWidth, roiHeight, testOnlyPred, bUse1DSearchFor8x8, ibc);
4020
}
4021
4022
bool Search::predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
4023
{
4024
    MV zeroMv(0, 0);
4025
    CUData& cu = intraBCMode.cu;
4026
    Yuv* predYuv = &intraBCMode.predYuv;
4027
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
4028
    int  numPart = cu.getNumPartInter(0);
4029
    int log2ParallelMergeLevelMinus2 = 0;
4030
4031
    // 12 mv candidates including lowresMV
4032
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
4033
4034
    if (m_param->bEnableSCC == 1 && (1 << cu.m_log2CUSize[0]) > SCM_S0067_MAX_CAND_SIZE) // fast search
4035
        return false;
4036
4037
    uint32_t totalCost = 0;
4038
    for (int puIdx = 0; puIdx < numPart; puIdx++)
4039
    {
4040
        int width, height;
4041
        uint32_t partAddr = 0;
4042
        MotionData* bestME = intraBCMode.bestME[puIdx];
4043
        PredictionUnit pu(cu, cuGeom, puIdx);
4044
        MV  mv, mvPred[2];
4045
        cu.getPartIndexAndSize(puIdx, pu.puAbsPartIdx, width, height);
4046
        partAddr = pu.puAbsPartIdx;
4047
        m_me.setSourcePU(*intraBCMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
4048
4049
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, intraBCMode.interNeighbours);
4050
        cu.getPMV(intraBCMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, puIdx, pu.puAbsPartIdx);
4051
4052
        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
4053
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);
4054
4055
        uint32_t cost;
4056
        mv.set(0, 0);
4057
        intraBlockCopyEstimate(intraBCMode, cuGeom, puIdx, mvPred, mv, cost, testOnlyPred, bUse1DSearchFor8x8, ibc);
4058
4059
        bestME->mv.set(mv.x << 2, mv.y << 2);
4060
        bestME->cost = cost;
4061
        totalCost += cost;
4062
        if (mv.x == 0 && mv.y == 0)
4063
        {
4064
            if (testOnlyPred)
4065
            {
4066
                m_lastCandCost = MAX_UINT;
4067
            }
4068
            return false;
4069
        }
4070
4071
        int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
4072
        int distAMVPBest, distMergeTemp;
4073
        int costAMVPBest, costMergeBest, costMergeTemp;
4074
        bitsAMVPBest = MAX_INT;
4075
        costAMVPBest = MAX_INT;
4076
        costMergeBest = MAX_INT;
4077
        int mvpIdxBest = 0;
4078
        int mvpIdxTemp;
4079
        int mrgIdxBest = -1;
4080
        int mrgIdxTemp = -1;
4081
        int xCUStart = cu.m_cuPelX;
4082
        int yCUStart = cu.m_cuPelY;
4083
        int xStartInCU = 0, yStartInCU = 0;
4084
        if (ePartSize == SIZE_2Nx2N)
4085
            xStartInCU = yStartInCU = 0;
4086
        else if (ePartSize == SIZE_2NxN)
4087
        {
4088
            xStartInCU = 0;
4089
            yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
4090
        }
4091
        else if (ePartSize == SIZE_Nx2N)
4092
        {
4093
            xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
4094
            yStartInCU = 0;
4095
        }
4096
        const pixel* currStart;
4097
        pixel* ref;
4098
        int currStride, refStride;
4099
        distAMVPBest = 0;
4100
4101
        MV cMvQuaterPixl = mv;
4102
        cMvQuaterPixl <<= 2;
4103
        cu.setPUMv(0, cMvQuaterPixl, pu.puAbsPartIdx, puIdx);
4104
        cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
4105
        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
4106
        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
4107
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
4108
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4109
        for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4110
        {
4111
            int tempHeight, tempWidth;
4112
            if (ch == 0)
4113
            {
4114
                tempHeight = height;
4115
                tempWidth = width;
4116
                ref = tmpPredYuv.getLumaAddr(partAddr);
4117
                refStride = tmpPredYuv.m_size;
4118
                distAMVPBest += m_me.bufSAD(ref, refStride);
4119
            }
4120
            else
4121
            {
4122
                tempHeight = height >> m_vChromaShift;
4123
                tempWidth = width >> m_hChromaShift;
4124
4125
                currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
4126
                currStride = intraBCMode.fencYuv->m_csize;
4127
                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4128
                refStride = tmpPredYuv.m_csize;
4129
                distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4130
            }
4131
        }
4132
4133
        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
4134
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);
4135
4136
        for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4137
        {
4138
            m_me.setMVP(mvPred[mvpIdxTemp]);
4139
            bitsAMVPTemp = m_me.bitcost(mv, mvPred[mvpIdxTemp]);
4140
            if (bitsAMVPTemp < bitsAMVPBest)
4141
            {
4142
                bitsAMVPBest = bitsAMVPTemp;
4143
                mvpIdxBest = mvpIdxTemp;
4144
            }
4145
        }
4146
4147
        bitsAMVPBest++; // for MVP Index bits
4148
        costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4149
4150
        MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4151
        uint8_t uhInterDirNeighbours[MRG_MAX_NUM_CANDS];
4152
        int numValidMergeCand = 0;
4153
4154
        for (int i = 0; i < MRG_MAX_NUM_CANDS; i++)
4155
        {
4156
            cMvFieldNeighbours[i][0].mv.set(0, 0);
4157
            cMvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
4158
        }
4159
4160
        if (ePartSize != SIZE_2Nx2N)
4161
        {
4162
            if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4163
            {
4164
                cu.setPartSizeSubParts(SIZE_2Nx2N);
4165
                if (puIdx == 0)
4166
                {
4167
                    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, uhInterDirNeighbours);
4168
                }
4169
                cu.setPartSizeSubParts(ePartSize);
4170
            }
4171
            else
4172
            {
4173
                numValidMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, cMvFieldNeighbours, uhInterDirNeighbours);
4174
            }
4175
4176
            cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
4177
            restrictBipredMergeCand(&cu, puIdx, cMvFieldNeighbours, uhInterDirNeighbours, numValidMergeCand);
4178
4179
            for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCand; mrgIdxTemp++)
4180
            {
4181
                if (uhInterDirNeighbours[mrgIdxTemp] != 1)
4182
                {
4183
                    continue;
4184
                }
4185
                if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4186
                {
4187
                    continue;
4188
                }
4189
4190
                if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, width, height, &cu,
4191
                    xStartInCU, yStartInCU, (cMvFieldNeighbours[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighbours[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4192
                {
4193
                    continue;
4194
                }
4195
                bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4196
4197
                distMergeTemp = 0;
4198
4199
                cu.setPUMv(0, cMvFieldNeighbours[mrgIdxTemp][0].mv, pu.puAbsPartIdx, puIdx);
4200
                cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, puIdx);
4201
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
4202
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
4203
                cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
4204
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4205
4206
                for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4207
                {
4208
                    int tempHeight, tempWidth;
4209
                    if (ch == 0)
4210
                    {
4211
                        tempHeight = height;
4212
                        tempWidth = width;
4213
                        ref = tmpPredYuv.getLumaAddr(partAddr);
4214
                        refStride = tmpPredYuv.m_size;
4215
                        distMergeTemp += m_me.bufSAD(ref, refStride);
4216
                    }
4217
                    else
4218
                    {
4219
                        tempHeight = height >> m_vChromaShift;
4220
                        tempWidth = width >> m_hChromaShift;
4221
4222
                        currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
4223
                        currStride = intraBCMode.fencYuv->m_csize;
4224
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4225
                        refStride = tmpPredYuv.m_csize;
4226
                        distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4227
                    }
4228
                }
4229
                costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4230
4231
                if (costMergeTemp < costMergeBest)
4232
                {
4233
                    costMergeBest = costMergeTemp;
4234
                    mrgIdxBest = mrgIdxTemp;
4235
                }
4236
            }
4237
        }
4238
        if (costAMVPBest < costMergeBest)
4239
        {
4240
            MV tempmv((mv.x << 2), (mv.y << 2));
4241
            MVField mvField[2];
4242
            mvField[0].mv = tempmv;
4243
            mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4244
            mvField[1].mv = zeroMv;
4245
            mvField[1].refIdx = REF_NOT_VALID;
4246
4247
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
4248
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction
4249
4250
            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
4251
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
4252
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
4253
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
4254
4255
            MV mvd;
4256
            mvd.set(mv.x - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].x >> 2), mv.y - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].y >> 2));
4257
4258
            cu.m_mvd[0][pu.puAbsPartIdx] = mvd;
4259
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mvpIdxBest;
4260
        }
4261
        else
4262
        {
4263
            MV MV(cMvFieldNeighbours[mrgIdxBest][0].mv.x, cMvFieldNeighbours[mrgIdxBest][0].mv.y);
4264
            MVField mvField[2];
4265
            mvField[0].mv = MV;
4266
            mvField[0].refIdx = cu.m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4267
            mvField[1].mv = zeroMv;
4268
            mvField[1].refIdx = REF_NOT_VALID;
4269
4270
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4271
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mrgIdxBest; /* merge candidate ID is stored in L0 MVP idx */
4272
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction
4273
4274
            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
4275
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
4276
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
4277
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
4278
4279
            cu.m_mvd[0][pu.puAbsPartIdx] = zeroMv;
4280
            cu.m_mvd[1][pu.puAbsPartIdx] = zeroMv;
4281
        }
4282
        motionCompensation(cu, pu, *predYuv, 1, 1);
4283
    }
4284
4285
    PredictionUnit pu(cu, cuGeom, 0);
4286
    uint32_t abortThreshold = (1 << cu.m_log2CUSize[0]) * (1 << cu.m_log2CUSize[0]) * 2;
4287
    if (testOnlyPred)
4288
    {
4289
        if (numPart == 1 && totalCost > abortThreshold)
4290
        {
4291
            m_lastCandCost = MAX_UINT;
4292
            return false;
4293
        }
4294
        m_lastCandCost = totalCost;
4295
    }
4296
    else if (totalCost < abortThreshold && 3 * totalCost >> 2 >= m_lastCandCost)
4297
    {
4298
        return false;
4299
    }
4300
    return true;
4301
}
4302
4303
bool Search::predMixedIntraBCInterSearch(Mode& intraBCMixedMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMvCandList)
4304
{
4305
    intraBCMixedMode.initCosts();
4306
    intraBCMixedMode.cu.setPartSizeSubParts(ePartSize);
4307
    intraBCMixedMode.cu.setPredModeSubParts(MODE_INTER);
4308
    CUData& cu = intraBCMixedMode.cu;
4309
    int numComb = 2;
4310
    int numPart = 2;
4311
    uint32_t cost[2] = { 0,0 };
4312
    uint32_t maxCost = UINT32_MAX;
4313
4314
    int      numPredDir = m_slice->isInterP() ? 1 : 2;
4315
    MV       cMvZero(0, 0);
4316
4317
    MV  cMvPredCand[2][2];
4318
    int IBCValidFlag = 0;
4319
    int bestIBCMvpIdx[2] = { 0, 0 };
4320
    int bestInterMvpIdx[2] = { 0, 0 };
4321
    int bestInterDir[2] = { 0, 0 };
4322
    int bestRefIdx[2] = { 0, 0 };
4323
    bool isMergeMode[2] = { false, false };
4324
    bool isIBCMergeMode[2] = { false, false };
4325
    MVField cMRGMvField[2][2];
4326
    MVField cMRGMvFieldIBC[2][2];
4327
    int log2ParallelMergeLevelMinus2 = 0;
4328
    // 12 mv candidates including lowresMV
4329
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
4330
4331
    Yuv* predYuv = &intraBCMixedMode.predYuv;
4332
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
4333
4334
    for (int combo = 0; combo < numComb; combo++) // number of combination
4335
    {
4336
        for (int partIdx = 0; partIdx < numPart; ++partIdx)
4337
        {
4338
            int dummyWidth, dummyHeight;
4339
            uint32_t partAddr = 0;
4340
            PredictionUnit pu(cu, cuGeom, partIdx);
4341
            cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth, dummyHeight);
4342
            m_me.setSourcePU(*intraBCMixedMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
4343
4344
            MV mvPred[2];
4345
            MV bvPred[2];
4346
            if ((combo == 0 && partIdx == 0) || (combo == 1 && partIdx == 1)) // intraBC
4347
            {
4348
                MV cMv = iMvCandList[8 + partIdx];
4349
                if (cMv.x == 0 && cMv.y == 0)
4350
                {
4351
                    cost[combo] = maxCost;
4352
                    IBCValidFlag++;
4353
                    break;
4354
                }
4355
4356
                cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4357
                cu.getPMV(intraBCMixedMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, partIdx, pu.puAbsPartIdx);
4358
4359
                bvPred[0] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0];
4360
                bvPred[1] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1];
4361
                bvPred[0] >>= 2;
4362
                bvPred[1] >>= 2;
4363
4364
                /////////////////////////////////////////////////////////////
4365
                // ibc merge
4366
                // choose one MVP and compare with merge mode
4367
4368
                int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
4369
                int distAMVPBest, distMergeTemp;
4370
                int costAMVPBest, costMergeBest, costMergeTemp;
4371
                bitsAMVPBest = MAX_INT;
4372
                costAMVPBest = MAX_INT;
4373
                costMergeBest = MAX_INT;
4374
                int mvpIdxBest = 0;
4375
                int mvpIdxTemp;
4376
                int mrgIdxBest = -1;
4377
                int mrgIdxTemp = -1;
4378
                int xCUStart = cu.m_cuPelX;
4379
                int yCUStart = cu.m_cuPelY;
4380
                int xStartInCU = 0, yStartInCU = 0;
4381
                if (ePartSize == SIZE_2Nx2N)
4382
                    xStartInCU = yStartInCU = 0;
4383
                else if (ePartSize == SIZE_2NxN)
4384
                {
4385
                    xStartInCU = 0;
4386
                    yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4387
                }
4388
                else if (ePartSize == SIZE_Nx2N)
4389
                {
4390
                    xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4391
                    yStartInCU = 0;
4392
                }
4393
                const pixel* currStart;
4394
                int currStride;
4395
                int refStride;
4396
                distAMVPBest = 0;
4397
                pixel* ref;
4398
4399
                cu.setPUMv(0, cMv, pu.puAbsPartIdx, partIdx);
4400
                cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, partIdx);
4401
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4402
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4403
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4404
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4405
4406
                for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4407
                {
4408
                    int tempHeight, tempWidth;
4409
                    if (ch == 0)
4410
                    {
4411
                        tempHeight = dummyHeight;
4412
                        tempWidth = dummyWidth;
4413
                        ref = tmpPredYuv.getLumaAddr(partAddr);
4414
                        refStride = tmpPredYuv.m_size;
4415
                        distAMVPBest += m_me.bufSAD(ref, refStride);
4416
                    }
4417
                    else
4418
                    {
4419
                        tempHeight = dummyHeight >> m_vChromaShift;
4420
                        tempWidth = dummyWidth >> m_hChromaShift;
4421
4422
                        currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4423
                        currStride = intraBCMixedMode.fencYuv->m_csize;
4424
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4425
                        refStride = tmpPredYuv.m_csize;
4426
                        distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4427
                    }
4428
                }
4429
4430
                MV check;
4431
                for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4432
                {
4433
                    m_me.setMVP(bvPred[mvpIdxTemp]);
4434
                    bitsAMVPTemp = m_me.bitcost(cMv >> 2, bvPred[mvpIdxTemp]);
4435
                    if (bitsAMVPTemp < bitsAMVPBest)
4436
                    {
4437
                        bitsAMVPBest = bitsAMVPTemp;
4438
                        mvpIdxBest = mvpIdxTemp;
4439
                    }
4440
                }
4441
4442
                bitsAMVPBest++; // for MVP Index bits
4443
                costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4444
4445
                MVField cMvFieldNeighboursIBC[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4446
                uint8_t uhInterDirNeighboursIBC[MRG_MAX_NUM_CANDS];
4447
                int numValidMergeCandIBC = 0;
4448
4449
                if (ePartSize != SIZE_2Nx2N)
4450
                {
4451
                    if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4452
                    {
4453
                        cu.setPartSizeSubParts(SIZE_2Nx2N);
4454
                        if (partIdx == 0)
4455
                        {
4456
                            numValidMergeCandIBC = cu.getInterMergeCandidates(0, 0, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4457
                        }
4458
                        cu.setPartSizeSubParts(ePartSize);
4459
                    }
4460
                    else
4461
                    {
4462
                        numValidMergeCandIBC = cu.getInterMergeCandidates(pu.puAbsPartIdx, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4463
                    }
4464
4465
                    cu.roundMergeCandidates(cMvFieldNeighboursIBC, numValidMergeCandIBC);
4466
                    restrictBipredMergeCand(&cu, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC, numValidMergeCandIBC);
4467
4468
                    for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCandIBC; mrgIdxTemp++)
4469
                    {
4470
                        if (uhInterDirNeighboursIBC[mrgIdxTemp] != 1)
4471
                        {
4472
                            continue;
4473
                        }
4474
                        if (m_slice->m_refPOCList[0][cMvFieldNeighboursIBC[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4475
                        {
4476
                            continue;
4477
                        }
4478
4479
                        if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, dummyWidth, dummyHeight, &cu,
4480
                            xStartInCU, yStartInCU, (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4481
                        {
4482
                            continue;
4483
                        }
4484
                        bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4485
4486
                        distMergeTemp = 0;
4487
                        cu.setPUMv(0, cMvFieldNeighboursIBC[mrgIdxTemp][0].mv, pu.puAbsPartIdx, partIdx);
4488
                        cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4489
                        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4490
                        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4491
                        cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4492
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4493
4494
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4495
                        {
4496
                            int tempHeight, tempWidth;
4497
                            if (ch == 0)
4498
                            {
4499
                                tempHeight = dummyHeight;
4500
                                tempWidth = dummyWidth;
4501
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4502
                                refStride = tmpPredYuv.m_size;
4503
                                distMergeTemp += m_me.bufSAD(ref, refStride);
4504
                            }
4505
                            else
4506
                            {
4507
                                tempHeight = dummyHeight >> m_vChromaShift;
4508
                                tempWidth = dummyWidth >> m_hChromaShift;
4509
4510
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4511
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4512
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4513
                                refStride = tmpPredYuv.m_csize;
4514
                                distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4515
                            }
4516
                        }
4517
                        costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4518
4519
                        if (costMergeTemp < costMergeBest)
4520
                        {
4521
                            costMergeBest = costMergeTemp;
4522
                            mrgIdxBest = mrgIdxTemp;
4523
                        }
4524
                    }
4525
                }
4526
4527
                if (costMergeBest < costAMVPBest)
4528
                {
4529
                    cost[combo] += costMergeBest;
4530
                    isIBCMergeMode[combo] = true;
4531
                    bestIBCMvpIdx[combo] = mrgIdxBest;
4532
4533
                    MVField mvField[2];
4534
                    MV mv(cMvFieldNeighboursIBC[mrgIdxBest][0].mv.x, cMvFieldNeighboursIBC[mrgIdxBest][0].mv.y);
4535
                    mvField[0].mv = mv;
4536
                    mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4537
                    mvField[1].mv = cMvZero;
4538
                    mvField[1].refIdx = REF_NOT_VALID;
4539
                    cMRGMvFieldIBC[combo][0] = mvField[0];
4540
                    cMRGMvFieldIBC[combo][1] = mvField[1];
4541
                }
4542
                else
4543
                {
4544
                    cost[combo] += costAMVPBest;
4545
                    isIBCMergeMode[combo] = false;
4546
                    bestIBCMvpIdx[combo] = mvpIdxBest;
4547
                    cMvPredCand[combo][partIdx].set(bvPred[mvpIdxBest].x << 2, bvPred[mvpIdxBest].y << 2);
4548
                }
4549
4550
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);  // list 0 prediction
4551
                if (isIBCMergeMode[combo])
4552
                {
4553
                    cu.setPUMv(0, cMRGMvFieldIBC[combo][0].mv, pu.puAbsPartIdx, partIdx);
4554
                }
4555
                else
4556
                {
4557
                    cu.setPUMv(0, iMvCandList[8 + partIdx], pu.puAbsPartIdx, partIdx);
4558
                    cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4559
                    cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4560
                }
4561
                // ibc merge
4562
                /////////////////////////////////////////////////////////////
4563
            }
4564
            else // is inter PU
4565
            {
4566
                uint32_t  costInterTemp = 0;
4567
                uint32_t  costInterBest = UINT32_MAX;
4568
                const pixel* currStart;
4569
                int currStride;
4570
                pixel* ref;
4571
                int refStride;
4572
                MergeData merge;
4573
                memset(&merge, 0, sizeof(merge));
4574
                for (int refList = 0; refList < numPredDir; refList++)
4575
                {
4576
                    uint32_t numRef = refList ? ((m_slice->m_numRefIdx[1] > 1) ? 2 : 1) : ((m_slice->m_numRefIdx[0] - 1 > 1) ? 2 : 1);
4577
                    for (uint32_t refIdx = 0; refIdx < numRef; refIdx++)
4578
                    {
4579
                        MV cMv = iMvCandList[4 * refList + 2 * refIdx + partIdx];
4580
4581
                        cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4582
                        cu.getPMV(intraBCMixedMode.interNeighbours, refList, refIdx, intraBCMixedMode.amvpCand[refList][refIdx], mvc, partIdx, pu.puAbsPartIdx);
4583
                        int mvpIdx;
4584
4585
                        uint32_t  tempCost0 = 0;
4586
                        uint32_t  tempCost1 = 0;
4587
                        mvPred[0] = intraBCMixedMode.amvpCand[refList][refIdx][0];
4588
                        mvPred[1] = intraBCMixedMode.amvpCand[refList][refIdx][1];
4589
4590
                        m_me.setMVP(mvPred[0]);
4591
                        tempCost0 = m_me.bitcost(cMv, mvPred[0]);
4592
                        m_me.setMVP(mvPred[1]);
4593
                        tempCost1 = m_me.bitcost(cMv, mvPred[1]);
4594
                        if (tempCost1 < tempCost0)
4595
                        {
4596
                            mvpIdx = 1;
4597
                        }
4598
                        else
4599
                        {
4600
                            mvpIdx = 0;
4601
                        }
4602
                        uint32_t bitsTemp = m_listSelBits[refList] + MVP_IDX_BITS;
4603
                        bitsTemp += getTUBits(refIdx, numRef);
4604
4605
                        m_me.setMVP(mvPred[mvpIdx]);
4606
                        if (cu.m_slice->m_useIntegerMv)
4607
                        {
4608
                            cu.setPUMv(refList, (cMv >> 2) << 2, pu.puAbsPartIdx, partIdx);
4609
                        }
4610
                        else
4611
                        {
4612
                            cu.setPUMv(refList, cMv, pu.puAbsPartIdx, partIdx);
4613
                        }
4614
                        cu.setPURefIdx(refList, refIdx, pu.puAbsPartIdx, partIdx);
4615
                        cu.setPUInterDir(1 + refList, pu.puAbsPartIdx, partIdx);
4616
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4617
4618
                        costInterTemp = 0;
4619
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4620
                        {
4621
                            int tempHeight, tempWidth;
4622
                            if (ch == 0)
4623
                            {
4624
                                tempHeight = dummyHeight;
4625
                                tempWidth = dummyWidth;
4626
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4627
                                refStride = tmpPredYuv.m_size;
4628
                                costInterTemp += m_me.bufSAD(ref, refStride);
4629
                            }
4630
                            else
4631
                            {
4632
                                tempHeight = dummyHeight >> m_vChromaShift;
4633
                                tempWidth = dummyWidth >> m_hChromaShift;
4634
4635
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4636
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4637
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4638
                                refStride = tmpPredYuv.m_csize;
4639
                                costInterTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4640
                            }
4641
4642
                            if (costInterTemp >= costInterBest)
4643
                            {
4644
                                break;
4645
                            }
4646
                        }
4647
                        cu.setPURefIdx(refList, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4648
4649
                        costInterTemp += m_me.bitcost(cMv, mvPred[mvpIdx]);
4650
                        costInterTemp += m_rdCost.getCost(bitsTemp);
4651
4652
                        if (costInterTemp < costInterBest)
4653
                        {
4654
                            costInterBest = costInterTemp;
4655
                            bestInterMvpIdx[combo] = mvpIdx;
4656
                            bestInterDir[combo] = refList;
4657
                            bestRefIdx[combo] = refIdx;
4658
                            cMvPredCand[combo][partIdx] = mvPred[mvpIdx];
4659
                        }
4660
                    }
4661
                } // end RefIdx and RefList search
4662
4663
                uint32_t MRGInterDir = 0;
4664
                uint32_t MRGIndex = 0;
4665
4666
                // find Merge result
4667
                uint32_t MRGCost = UINT32_MAX;
4668
                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4669
4670
                mergeEstimation(cu, cuGeom, pu, partIdx, merge);
4671
                MRGInterDir = merge.dir;
4672
                cMRGMvField[combo][0] = merge.mvField[0];
4673
                cMRGMvField[combo][1] = merge.mvField[1];
4674
                MRGIndex = merge.index;
4675
                cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4676
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4677
4678
                if (MRGCost < costInterBest)
4679
                {
4680
                    costInterBest = MRGCost;
4681
                    isMergeMode[combo] = true;
4682
                    bestInterMvpIdx[combo] = MRGIndex;
4683
                    bestInterDir[combo] = MRGInterDir;
4684
                }
4685
4686
                cost[combo] += costInterBest;
4687
                if (isMergeMode[combo])
4688
                {
4689
                    cu.setPUInterDir(bestInterDir[combo], pu.puAbsPartIdx, partIdx);
4690
                    cu.setPUMv(0, cMRGMvField[combo][0].mv, pu.puAbsPartIdx, partIdx);
4691
                    cu.setPURefIdx(0, cMRGMvField[combo][0].refIdx, pu.puAbsPartIdx, partIdx);
4692
                    cu.setPUMv(1, cMRGMvField[combo][1].mv, pu.puAbsPartIdx, partIdx);
4693
                    cu.setPURefIdx(1, cMRGMvField[combo][1].refIdx, pu.puAbsPartIdx, partIdx);
4694
                }
4695
                else
4696
                {
4697
                    int refListOpt = bestInterDir[combo];
4698
                    int refIdxOpt = bestRefIdx[combo];
4699
                    if (cu.m_slice->m_useIntegerMv)
4700
                    {
4701
                        cu.setPUMv(refListOpt, (iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, pu.puAbsPartIdx, partIdx);
4702
                    }
4703
                    else
4704
                    {
4705
                        cu.setPUMv(refListOpt, iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt], pu.puAbsPartIdx, partIdx);
4706
                    }
4707
                    cu.setPURefIdx(refListOpt, refIdxOpt, pu.puAbsPartIdx, partIdx);
4708
                    cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4709
                    cu.setPUInterDir(1 + refListOpt, pu.puAbsPartIdx, partIdx);
4710
                    cu.m_mvpIdx[refListOpt][pu.puAbsPartIdx] = bestInterMvpIdx[combo];
4711
                }
4712
            }
4713
        } // for ipartIdx
4714
    } // for combo
4715
4716
    if (IBCValidFlag > 1)
4717
    {
4718
        return false;
4719
    }
4720
4721
    MV cMvd;
4722
    MV cMVFinal;
4723
    if (cost[0] <= cost[1])
4724
    {
4725
        int iDummyWidth1, iDummyHeight1;
4726
        uint32_t partAddr = 0;
4727
        uint32_t partIdx = 0;
4728
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4729
4730
        if (isIBCMergeMode[0])
4731
        {
4732
            cu.m_mergeFlag[partAddr] = true;
4733
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4734
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4735
            cu.setPUMv(0, cMRGMvFieldIBC[0][0].mv, partAddr, partIdx);
4736
            cu.setPURefIdx(0, cMRGMvFieldIBC[0][0].refIdx, partAddr, partIdx);
4737
            cu.setPUMv(1, cMRGMvFieldIBC[0][1].mv, partAddr, partIdx);
4738
            cu.setPURefIdx(1, cMRGMvFieldIBC[0][1].refIdx, partAddr, partIdx);
4739
4740
            cu.m_mvd[0][partAddr] = cMvZero;
4741
            cu.m_mvd[1][partAddr] = cMvZero;
4742
        }
4743
        else
4744
        {
4745
            cu.m_mergeFlag[partAddr] = false;
4746
4747
            cMvd.set((iMvCandList[8].x - cMvPredCand[0][0].x) >> 2, (iMvCandList[8].y - cMvPredCand[0][0].y) >> 2);
4748
            cu.setPUMv(0, iMvCandList[8], partAddr, partIdx);
4749
            cu.m_mvd[0][partAddr] = cMvd;
4750
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4751
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4752
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4753
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4754
        }
4755
4756
        partIdx = 1;
4757
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4758
4759
        if (isMergeMode[0])
4760
        {
4761
            cu.m_mergeFlag[partAddr] = true;
4762
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[0];
4763
            cu.setPUInterDir(bestInterDir[0], partAddr, partIdx);  // list 0 prediction
4764
            cu.setPUMv(0, cMRGMvField[0][0].mv, partAddr, partIdx);
4765
            cu.setPURefIdx(0, cMRGMvField[0][0].refIdx, partAddr, partIdx);
4766
            cu.setPUMv(1, cMRGMvField[0][1].mv, partAddr, partIdx);
4767
            cu.setPURefIdx(1, cMRGMvField[0][1].refIdx, partAddr, partIdx);
4768
4769
            cu.m_mvd[0][partAddr] = cMvZero;
4770
            cu.m_mvd[1][partAddr] = cMvZero;
4771
        }
4772
        else
4773
        {
4774
            int refListOpt = bestInterDir[0];
4775
            int refIdxOpt = bestRefIdx[0];
4776
            if (cu.m_slice->m_useIntegerMv)
4777
            {
4778
                cMvd.set(((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[0][1].x >> 2)), ((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[0][1].y >> 2)));
4779
                cu.setPUMv(refListOpt, (iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4780
            }
4781
            else
4782
            {
4783
                cMvd.set(iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[0][1].x, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[0][1].y);
4784
                cu.setPUMv(refListOpt, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4785
            }
4786
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4787
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4788
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4789
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4790
            cu.m_mergeFlag[partAddr] = false;
4791
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[0];
4792
        }
4793
    }
4794
    else
4795
    {
4796
        int dummyWidth2, dummyHeight2;
4797
        uint32_t partAddr = 0;
4798
        uint32_t partIdx = 0;
4799
4800
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4801
4802
        if (isMergeMode[1])
4803
        {
4804
            cu.m_mergeFlag[partAddr] = true;
4805
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[1];
4806
            cu.setPUInterDir(bestInterDir[1], partAddr, partIdx);  // list 0 prediction
4807
            cu.setPUMv(0, cMRGMvField[1][0].mv, partAddr, partIdx);
4808
            cu.setPURefIdx(0, cMRGMvField[1][0].refIdx, partAddr, partIdx);
4809
            cu.setPUMv(1, cMRGMvField[1][1].mv, partAddr, partIdx);
4810
            cu.setPURefIdx(1, cMRGMvField[1][1].refIdx, partAddr, partIdx);
4811
4812
            cu.m_mvd[0][partAddr] = cMvZero;
4813
            cu.m_mvd[1][partAddr] = cMvZero;
4814
        }
4815
        else
4816
        {
4817
            int refListOpt = bestInterDir[1];
4818
            int refIdxOpt = bestRefIdx[1];
4819
            if (cu.m_slice->m_useIntegerMv)
4820
            {
4821
                cMvd.set((iMvCandList[2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[1][0].x >> 2), (iMvCandList[2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[1][0].y >> 2));
4822
                cu.setPUMv(refListOpt, (iMvCandList[2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4823
            }
4824
            else
4825
            {
4826
                cMvd.set(iMvCandList[2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[1][0].x, iMvCandList[2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[1][0].y);
4827
                cu.setPUMv(refListOpt, iMvCandList[2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4828
            }
4829
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4830
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4831
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4832
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4833
            cu.m_mergeFlag[partAddr] = false;
4834
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[1];
4835
        }
4836
4837
        partIdx = 1;
4838
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4839
4840
        if (isIBCMergeMode[1])
4841
        {
4842
            cu.m_mergeFlag[partAddr] = true;
4843
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4844
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4845
            cu.setPUMv(0, cMRGMvFieldIBC[1][0].mv, partAddr, partIdx);
4846
            cu.setPURefIdx(0, cMRGMvFieldIBC[1][0].refIdx, partAddr, partIdx);
4847
            cu.setPUMv(1, cMRGMvFieldIBC[1][1].mv, partAddr, partIdx);
4848
            cu.setPURefIdx(1, cMRGMvFieldIBC[1][1].refIdx, partAddr, partIdx);
4849
4850
            cu.m_mvd[0][partAddr] = cMvZero;
4851
            cu.m_mvd[1][partAddr] = cMvZero;
4852
        }
4853
        else
4854
        {
4855
            cu.m_mergeFlag[partAddr] = false;
4856
4857
            cMvd.set(((iMvCandList[9].x - cMvPredCand[1][1].x) >> 2), (iMvCandList[9].y - cMvPredCand[1][1].y) >> 2);
4858
            cu.setPUMv(0, iMvCandList[9], partAddr, partIdx);
4859
            cu.m_mvd[0][partAddr] = cMvd;
4860
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4861
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4862
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4863
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4864
        }
4865
    }
4866
    for (int partIdx = 0; partIdx < numPart; ++partIdx)
4867
    {
4868
        PredictionUnit pu(cu, cuGeom, partIdx);
4869
        motionCompensation(cu, pu, *predYuv, 1, 1);
4870
    }
4871
4872
    return true;
4873
}
4874
#endif
4875
4876
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
4877
0
{
4878
0
    if (cuMode == SIZE_2Nx2N)
4879
0
    {
4880
0
        blockBit[0] = (!bPSlice) ? 3 : 1;
4881
0
        blockBit[1] = 3;
4882
0
        blockBit[2] = 5;
4883
0
    }
4884
0
    else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
4885
0
    {
4886
0
        static const uint32_t listBits[2][3][3] =
4887
0
        {
4888
0
            { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
4889
0
            { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
4890
0
        };
4891
0
        if (bPSlice)
4892
0
        {
4893
0
            blockBit[0] = 3;
4894
0
            blockBit[1] = 0;
4895
0
            blockBit[2] = 0;
4896
0
        }
4897
0
        else
4898
0
            memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
4899
0
    }
4900
0
    else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
4901
0
    {
4902
0
        static const uint32_t listBits[2][3][3] =
4903
0
        {
4904
0
            { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
4905
0
            { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
4906
0
        };
4907
0
        if (bPSlice)
4908
0
        {
4909
0
            blockBit[0] = 3;
4910
0
            blockBit[1] = 0;
4911
0
            blockBit[2] = 0;
4912
0
        }
4913
0
        else
4914
0
            memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
4915
0
    }
4916
0
    else if (cuMode == SIZE_NxN)
4917
0
    {
4918
0
        blockBit[0] = (!bPSlice) ? 3 : 1;
4919
0
        blockBit[1] = 3;
4920
0
        blockBit[2] = 5;
4921
0
    }
4922
0
    else
4923
0
    {
4924
0
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
4925
0
    }
4926
0
}
4927
4928
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
4929
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
4930
0
{
4931
0
    int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);
4932
0
    if (diffBits < 0)
4933
0
    {
4934
0
        mvpIdx = !mvpIdx;
4935
0
        uint32_t origOutBits = outBits;
4936
0
        outBits = origOutBits + diffBits;
4937
0
        outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4938
0
    }
4939
0
    return amvpCand[mvpIdx];
4940
0
}
4941
4942
/* Update to default MVP when using an alternative mvp */
4943
void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
4944
0
{
4945
0
    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
4946
0
    uint32_t origOutBits = outBits;
4947
0
    outBits = origOutBits + diffBits;
4948
0
    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4949
0
}
4950
4951
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
4952
0
{
4953
0
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
4954
0
    mvmin = mvp - dist;
4955
0
    mvmax = mvp + dist;
4956
4957
0
    if (m_vertRestriction)
4958
0
    {
4959
0
        int mvRestricted = (56 - 1) << 2; // -1 to consider subpel search
4960
0
        if (mvmax.y >= mvRestricted)
4961
0
        {
4962
0
            mvmax.y = mvRestricted; //only positive side is restricted
4963
0
        }
4964
0
    }
4965
4966
0
    cu.clipMv(mvmin);
4967
0
    cu.clipMv(mvmax);
4968
4969
0
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
4970
0
          cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
4971
0
          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
4972
0
    {
4973
0
        int safeX, maxSafeMv;
4974
0
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
4975
0
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
4976
0
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
4977
0
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
4978
0
    }
4979
4980
    // apply restrict on slices
4981
0
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
4982
0
    {
4983
0
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
4984
0
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
4985
0
    }
4986
4987
    /* Clip search range to signaled maximum MV length.
4988
     * We do not support this VUI field being changed from the default */
4989
0
    const int maxMvLen = (1 << 15) - 1;
4990
0
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
4991
0
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
4992
0
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
4993
0
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);
4994
4995
0
    mvmin >>= 2;
4996
0
    mvmax >>= 2;
4997
4998
    /* conditional clipping for frame parallelism */
4999
0
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
5000
0
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);
5001
5002
    /* conditional clipping for negative mv range */
5003
0
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
5004
0
}
5005
5006
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5007
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
5008
0
{
5009
0
    CUData& cu = interMode.cu;
5010
0
    Yuv* reconYuv = &interMode.reconYuv;
5011
0
    const Yuv* fencYuv = interMode.fencYuv;
5012
0
    Yuv* predYuv = &interMode.predYuv;
5013
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5014
0
    uint32_t depth  = cu.m_cuDepth[0];
5015
5016
    // No residual coding : SKIP mode
5017
5018
0
    cu.setPredModeSubParts(MODE_SKIP);
5019
0
    cu.clearCbf();
5020
0
    cu.setTUDepthSubParts(0, 0, depth);
5021
5022
0
    reconYuv->copyFromYuv(interMode.predYuv);
5023
5024
    // Luma
5025
0
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
5026
0
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5027
0
    interMode.distortion = interMode.lumaDistortion;
5028
    // Chroma
5029
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5030
0
    {
5031
0
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5032
0
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5033
0
        interMode.distortion += interMode.chromaDistortion;
5034
0
    }
5035
0
    cu.m_distortion[0] = interMode.distortion;
5036
0
    m_entropyCoder.load(m_rqt[depth].cur);
5037
0
    m_entropyCoder.resetBits();
5038
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5039
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
5040
0
    m_entropyCoder.codeSkipFlag(cu, 0);
5041
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5042
0
    m_entropyCoder.codeMergeIndex(cu, 0);
5043
0
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5044
0
    interMode.coeffBits = 0;
5045
0
    interMode.totalBits = interMode.mvBits + skipFlagBits;
5046
0
    if (m_rdCost.m_psyRd)
5047
0
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5048
0
    else if(m_rdCost.m_ssimRd)
5049
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5050
5051
0
    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5052
0
    updateModeCost(interMode);
5053
0
    m_entropyCoder.store(interMode.contexts);
5054
0
}
5055
5056
/* encode residual and calculate rate-distortion for a CU block.
5057
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5058
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
5059
0
{
5060
0
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
5061
5062
0
    CUData& cu = interMode.cu;
5063
0
    Yuv* reconYuv = &interMode.reconYuv;
5064
0
    Yuv* predYuv = &interMode.predYuv;
5065
0
    uint32_t depth = cuGeom.depth;
5066
0
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
5067
0
    const Yuv* fencYuv = interMode.fencYuv;
5068
5069
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5070
5071
0
    uint32_t log2CUSize = cuGeom.log2CUSize;
5072
0
    int sizeIdx = log2CUSize - 2;
5073
5074
0
    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5075
5076
0
    uint32_t tuDepthRange[2];
5077
0
    cu.getInterTUQtDepthRange(tuDepthRange, 0);
5078
5079
0
    m_entropyCoder.load(m_rqt[depth].cur);
5080
5081
0
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
5082
0
        m_maxTUDepth = -1;
5083
0
    else if (m_limitTU & X265_TU_LIMIT_BFS)
5084
0
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
5085
5086
0
    Cost costs;
5087
0
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
5088
0
    {
5089
        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
5090
0
        int32_t tempDepth = m_maxTUDepth;
5091
0
        if (m_maxTUDepth != -1)
5092
0
        {
5093
0
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
5094
0
            uint32_t minSize = tuDepthRange[0];
5095
0
            uint32_t maxSize = tuDepthRange[1];
5096
0
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
5097
0
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
5098
0
        }
5099
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5100
0
        m_maxTUDepth = tempDepth;
5101
0
    }
5102
0
    else
5103
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5104
5105
0
    uint32_t tqBypass = cu.m_tqBypass[0];
5106
0
    if (!tqBypass)
5107
0
    {
5108
0
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5109
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5110
0
        {
5111
0
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
5112
0
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
5113
0
        }
5114
5115
        /* Consider the RD cost of not signaling any residual */
5116
0
        m_entropyCoder.load(m_rqt[depth].cur);
5117
0
        m_entropyCoder.resetBits();
5118
0
        m_entropyCoder.codeQtRootCbfZero();
5119
0
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
5120
5121
0
        uint32_t cbf0Energy; uint64_t cbf0Cost;
5122
0
        if (m_rdCost.m_psyRd)
5123
0
        {
5124
0
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5125
0
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5126
0
        }
5127
0
        else if(m_rdCost.m_ssimRd)
5128
0
        {
5129
0
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
5130
0
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5131
0
        }
5132
0
        else
5133
0
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
5134
5135
0
        if (cbf0Cost < costs.rdcost)
5136
0
        {
5137
0
            cu.clearCbf();
5138
0
            cu.setTUDepthSubParts(0, 0, depth);
5139
0
        }
5140
0
    }
5141
5142
0
    if (cu.getQtRootCbf(0))
5143
0
        saveResidualQTData(cu, *resiYuv, 0, 0);
5144
5145
    /* calculate signal bits for inter/merge/skip coded CU */
5146
0
    m_entropyCoder.load(m_rqt[depth].cur);
5147
5148
0
    m_entropyCoder.resetBits();
5149
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5150
0
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
5151
5152
0
    uint32_t coeffBits, bits, mvBits;
5153
0
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
5154
0
    {
5155
0
        cu.setPredModeSubParts(MODE_SKIP);
5156
5157
        /* Merge/Skip */
5158
0
        coeffBits = mvBits = 0;
5159
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5160
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5161
0
        m_entropyCoder.codeMergeIndex(cu, 0);
5162
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5163
0
        bits = mvBits + skipFlagBits;
5164
0
    }
5165
0
    else
5166
0
    {
5167
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5168
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5169
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
5170
0
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
5171
0
        m_entropyCoder.codePredInfo(cu, 0);
5172
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5173
5174
0
        bool bCodeDQP = m_slice->m_pps->bUseDQP;
5175
0
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
5176
0
        bits = m_entropyCoder.getNumberOfWrittenBits();
5177
5178
0
        coeffBits = bits - mvBits - skipFlagBits;
5179
0
    }
5180
5181
0
    m_entropyCoder.store(interMode.contexts);
5182
5183
0
    if (cu.getQtRootCbf(0))
5184
0
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5185
0
    else
5186
0
        reconYuv->copyFromYuv(*predYuv);
5187
5188
    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
5189
0
    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5190
0
    interMode.distortion = bestLumaDist;
5191
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5192
0
    {
5193
0
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5194
0
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5195
0
        interMode.chromaDistortion = bestChromaDist;
5196
0
        interMode.distortion += bestChromaDist;
5197
0
    }
5198
0
    if (m_rdCost.m_psyRd)
5199
0
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5200
0
    else if(m_rdCost.m_ssimRd)
5201
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5202
5203
0
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5204
0
    interMode.totalBits = bits;
5205
0
    interMode.lumaDistortion = bestLumaDist;
5206
0
    interMode.coeffBits = coeffBits;
5207
0
    interMode.mvBits = mvBits;
5208
0
    cu.m_distortion[0] = interMode.distortion;
5209
0
    updateModeCost(interMode);
5210
0
    checkDQP(interMode, cuGeom);
5211
5212
#if ENABLE_SCC_EXT
5213
    if (m_param->bEnableSCC)
5214
        interMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
5215
#endif
5216
0
}
5217
5218
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
5219
0
{
5220
0
    uint32_t depth = cuGeom.depth + tuDepth;
5221
0
    CUData& cu = mode.cu;
5222
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5223
5224
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5225
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
5226
0
        bCheckFull = false;
5227
5228
0
    if (bCheckFull)
5229
0
    {
5230
        // code full block
5231
0
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5232
0
        uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5233
5234
0
        uint32_t tuDepthC = tuDepth;
5235
0
        if (log2TrSizeC < 2)
5236
0
        {
5237
0
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5238
0
            log2TrSizeC = 2;
5239
0
            tuDepthC--;
5240
0
            codeChroma &= !(absPartIdx & 3);
5241
0
        }
5242
5243
0
        uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5244
0
        uint32_t setCbf = 1 << tuDepth;
5245
5246
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5247
0
        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
5248
5249
0
        uint32_t sizeIdx  = log2TrSize  - 2;
5250
5251
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5252
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5253
5254
0
        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
5255
0
        const Yuv* fencYuv = mode.fencYuv;
5256
5257
0
        int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
5258
0
        uint32_t strideResiY = resiYuv.m_size;
5259
5260
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5261
0
        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5262
5263
0
        if (numSigY)
5264
0
        {
5265
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
5266
0
            cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
5267
0
        }
5268
0
        else
5269
0
        {
5270
0
            primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5271
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
5272
0
        }
5273
5274
0
        if (codeChroma)
5275
0
        {
5276
0
            uint32_t sizeIdxC = log2TrSizeC - 2;
5277
0
            uint32_t strideResiC = resiYuv.m_csize;
5278
5279
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5280
0
            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
5281
0
            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
5282
0
            bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5283
5284
0
            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5285
0
            do
5286
0
            {
5287
0
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5288
0
                uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5289
5290
0
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5291
0
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5292
5293
0
                int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
5294
0
                const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
5295
0
                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
5296
0
                if (numSigU)
5297
0
                {
5298
0
                    m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
5299
0
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5300
0
                }
5301
0
                else
5302
0
                {
5303
0
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0);
5304
0
                    cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5305
0
                }
5306
5307
0
                int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
5308
0
                const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
5309
0
                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
5310
0
                if (numSigV)
5311
0
                {
5312
0
                    m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
5313
0
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5314
0
                }
5315
0
                else
5316
0
                {
5317
0
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0);
5318
0
                    cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5319
0
                }
5320
0
            }
5321
0
            while (tuIterator.isNextSection());
5322
5323
0
            if (splitIntoSubTUs)
5324
0
            {
5325
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5326
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5327
0
            }
5328
0
        }
5329
0
    }
5330
0
    else
5331
0
    {
5332
0
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
5333
5334
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
5335
0
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
5336
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
5337
0
        {
5338
0
            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
5339
0
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
5340
0
            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5341
0
            {
5342
0
                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
5343
0
                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
5344
0
            }
5345
0
        }
5346
0
        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
5347
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5348
0
        {
5349
0
            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
5350
0
            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
5351
0
        }
5352
0
    }
5353
0
}
5354
5355
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
5356
0
{
5357
0
    uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
5358
5359
0
    if (m_rdCost.m_psyRd)
5360
0
        return m_rdCost.calcPsyRdCost(dist, nullBits, energy);
5361
0
    else if(m_rdCost.m_ssimRd)
5362
0
        return m_rdCost.calcSsimRdCost(dist, nullBits, energy);
5363
0
    else
5364
0
        return m_rdCost.calcRdCost(dist, nullBits);
5365
0
}
5366
5367
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
5368
0
{
5369
0
    CUData& cu = mode.cu;
5370
0
    uint32_t depth = cuGeom.depth + tuDepth;
5371
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5372
5373
0
    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
5374
0
    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
5375
0
    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
5376
0
    {
5377
0
        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
5378
0
        {
5379
0
            m_maxTUDepth = cu.m_tuDepth[0];
5380
            // Fetch maximum TU depth of first sub partition to limit recursion of others
5381
0
            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
5382
0
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
5383
0
        }
5384
0
        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
5385
0
        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
5386
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5387
0
        {
5388
0
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
5389
0
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
5390
0
        }
5391
0
    }
5392
0
    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
5393
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5394
0
    {
5395
0
        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
5396
0
        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
5397
0
    }
5398
5399
    // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
5400
    // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
5401
    // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
5402
    // at depth 0 (for example).
5403
0
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
5404
0
    m_entropyCoder.resetBits();
5405
0
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
5406
0
    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
5407
0
    splitCost.bits += splitCbfBits;
5408
5409
0
    if (m_rdCost.m_psyRd)
5410
0
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
5411
0
    else if(m_rdCost.m_ssimRd)
5412
0
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
5413
0
    else
5414
0
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
5415
        
5416
0
    return ycbf || ucbf || vcbf;
5417
0
}
5418
5419
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
5420
0
{
5421
0
    CUData& cu = mode.cu;
5422
0
    uint32_t depth = cuGeom.depth + tuDepth;
5423
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5424
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
5425
5426
0
    bool bCheckSplit = log2TrSize > depthRange[0];
5427
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5428
0
    bool bSaveTUData = false, bLoadTUData = false;
5429
0
    uint32_t idx = 0;
5430
5431
0
    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
5432
0
    {
5433
0
        if (bCheckSplit && bCheckFull && tuDepth)
5434
0
        {
5435
0
            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
5436
0
            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
5437
0
            idx = (depth - 1) * 4 + qIdx;
5438
0
            if (splitMore)
5439
0
            {
5440
0
                bLoadTUData = true;
5441
0
                bCheckFull = false;
5442
0
            }
5443
0
            else
5444
0
            {
5445
0
                bSaveTUData = true;
5446
0
                bCheckSplit = false;
5447
0
            }
5448
0
        }
5449
0
    }
5450
0
    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
5451
0
    {
5452
0
        if (bCheckSplit && m_maxTUDepth >= 0)
5453
0
        {
5454
0
            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
5455
0
            bCheckSplit = log2TrSize > log2MaxTrSize;
5456
0
        }
5457
0
    }
5458
5459
0
    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
5460
5461
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
5462
0
        bCheckFull = false;
5463
5464
0
    X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
5465
5466
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5467
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5468
0
    uint32_t tuDepthC = tuDepth;
5469
0
    if (log2TrSizeC < 2)
5470
0
    {
5471
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5472
0
        log2TrSizeC = 2;
5473
0
        tuDepthC--;
5474
0
        codeChroma &= !(absPartIdx & 3);
5475
0
    }
5476
5477
    // code full block
5478
0
    Cost fullCost;
5479
0
    fullCost.rdcost = MAX_INT64;
5480
5481
0
    uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5482
0
    uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5483
0
    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5484
0
    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5485
0
    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5486
0
    uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5487
0
    uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
5488
5489
0
    m_entropyCoder.store(m_rqt[depth].rqtRoot);
5490
5491
0
    uint32_t trSize = 1 << log2TrSize;
5492
0
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5493
0
    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5494
0
    const Yuv* fencYuv = mode.fencYuv;
5495
5496
    // code full block
5497
0
    if (bCheckFull)
5498
0
    {
5499
0
        uint32_t trSizeC = 1 << log2TrSizeC;
5500
0
        int partSize = partitionFromLog2Size(log2TrSize);
5501
0
        int partSizeC = partitionFromLog2Size(log2TrSizeC);
5502
0
        const uint32_t qtLayer = log2TrSize - 2;
5503
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5504
0
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
5505
5506
0
        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
5507
0
        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
5508
0
        bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
5509
5510
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5511
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5512
5513
0
        if (bEnableRDOQ)
5514
0
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5515
5516
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5517
0
        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
5518
0
        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5519
0
        cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
5520
5521
0
        m_entropyCoder.resetBits();
5522
5523
0
        if (bSplitPresentFlag && log2TrSize > depthRange[0])
5524
0
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
5525
5526
0
        if (cbfFlag[TEXT_LUMA][0])
5527
0
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
5528
0
        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
5529
5530
0
        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
5531
5532
        //Assuming zero residual 
5533
0
        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5534
0
        uint32_t zeroEnergyY = 0;
5535
0
        if (m_rdCost.m_psyRd)
5536
0
            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5537
0
        else if(m_rdCost.m_ssimRd)
5538
0
            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
5539
5540
0
        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
5541
0
        uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
5542
5543
0
        if (cbfFlag[TEXT_LUMA][0])
5544
0
        {
5545
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
5546
5547
            // non-zero cost calculation for luma - This is an approximation
5548
            // finally we have to encode correct cbf after comparing with null cost
5549
0
            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
5550
0
            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
5551
0
            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
5552
0
            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5553
0
            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
5554
0
            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
5555
0
            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
5556
5557
0
            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
5558
0
            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
5559
0
            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
5560
0
            if (m_rdCost.m_psyRd)
5561
0
            {
5562
0
                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
5563
0
                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5564
0
            }
5565
0
            else if(m_rdCost.m_ssimRd)
5566
0
            {
5567
0
                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
5568
0
                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5569
0
            }
5570
0
            else
5571
0
                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
5572
5573
0
            if (cu.m_tqBypass[0])
5574
0
            {
5575
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5576
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5577
0
            }
5578
0
            else
5579
0
            {
5580
                // zero-cost calculation for luma. This is an approximation
5581
                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
5582
                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
5583
0
                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5584
5585
0
                if (nullCostY < singleCostY)
5586
0
                {
5587
0
                    cbfFlag[TEXT_LUMA][0] = 0;
5588
0
                    singleBits[TEXT_LUMA][0] = 0;
5589
0
                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5590
#if CHECKED_BUILD || _DEBUG
5591
                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
5592
                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
5593
#endif
5594
0
                    if (checkTransformSkipY)
5595
0
                        minCost[TEXT_LUMA][0] = nullCostY;
5596
0
                    singleDist[TEXT_LUMA][0] = zeroDistY;
5597
0
                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5598
0
                }
5599
0
                else
5600
0
                {
5601
0
                    if (checkTransformSkipY)
5602
0
                        minCost[TEXT_LUMA][0] = singleCostY;
5603
0
                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
5604
0
                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5605
0
                }
5606
0
            }
5607
0
        }
5608
0
        else
5609
0
        {
5610
0
            if (checkTransformSkipY)
5611
0
                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5612
0
            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5613
0
            singleDist[TEXT_LUMA][0] = zeroDistY;
5614
0
            singleBits[TEXT_LUMA][0] = 0;
5615
0
            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5616
0
        }
5617
5618
0
        cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5619
5620
0
        if (codeChroma)
5621
0
        {
5622
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5623
0
            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
5624
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5625
0
            {
5626
0
                sse_t zeroDistC = 0;
5627
0
                uint32_t zeroEnergyC = 0;
5628
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5629
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5630
5631
0
                do
5632
0
                {
5633
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5634
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5635
5636
0
                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5637
5638
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5639
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5640
5641
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5642
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5643
0
                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
5644
0
                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
5645
5646
0
                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
5647
0
                    if (cbfFlag[chromaId][tuIterator.section])
5648
0
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5649
5650
0
                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
5651
5652
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5653
0
                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
5654
5655
                    // Assuming zero residual 
5656
0
                    if (m_rdCost.m_psyRd)
5657
0
                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
5658
0
                    else if(m_rdCost.m_ssimRd)
5659
0
                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5660
5661
0
                    if (cbfFlag[chromaId][tuIterator.section])
5662
0
                    {
5663
0
                        m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
5664
0
                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
5665
5666
                        // non-zero cost calculation for luma, same as luma - This is an approximation
5667
                        // finally we have to encode correct cbf after comparing with null cost
5668
0
                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
5669
0
                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
5670
0
                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5671
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5672
0
                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5673
0
                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
5674
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
5675
0
                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
5676
0
                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
5677
0
                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
5678
0
                        if (m_rdCost.m_psyRd)
5679
0
                        {
5680
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
5681
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5682
0
                        }
5683
0
                        else if(m_rdCost.m_ssimRd)
5684
0
                        {
5685
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5686
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5687
0
                        }
5688
0
                        else
5689
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
5690
5691
0
                        if (cu.m_tqBypass[0])
5692
0
                        {
5693
0
                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5694
0
                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5695
0
                        }
5696
0
                        else
5697
0
                        {
5698
                            //zero-cost calculation for chroma. This is an approximation
5699
0
                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
5700
5701
0
                            if (nullCostC < singleCostC)
5702
0
                            {
5703
0
                                cbfFlag[chromaId][tuIterator.section] = 0;
5704
0
                                singleBits[chromaId][tuIterator.section] = 0;
5705
0
                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5706
#if CHECKED_BUILD || _DEBUG
5707
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5708
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
5709
#endif
5710
0
                                if (checkTransformSkipC)
5711
0
                                    minCost[chromaId][tuIterator.section] = nullCostC;
5712
0
                                singleDist[chromaId][tuIterator.section] = zeroDistC;
5713
0
                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5714
0
                            }
5715
0
                            else
5716
0
                            {
5717
0
                                if (checkTransformSkipC)
5718
0
                                    minCost[chromaId][tuIterator.section] = singleCostC;
5719
0
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5720
0
                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5721
0
                            }
5722
0
                        }
5723
0
                    }
5724
0
                    else
5725
0
                    {
5726
0
                        if (checkTransformSkipC)
5727
0
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
5728
0
                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5729
0
                        singleBits[chromaId][tuIterator.section] = 0;
5730
0
                        singleDist[chromaId][tuIterator.section] = zeroDistC;
5731
0
                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5732
0
                    }
5733
5734
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5735
0
                }
5736
0
                while (tuIterator.isNextSection());
5737
0
            }
5738
0
        }
5739
5740
0
        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
5741
0
        {
5742
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5743
0
            {
5744
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5745
0
                do
5746
0
                {
5747
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5748
0
                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5749
0
                }
5750
0
                while(tuIterator.isNextSection());
5751
0
            }
5752
0
        }
5753
0
        if (checkTransformSkipY)
5754
0
        {
5755
0
            sse_t nonZeroDistY = 0;
5756
0
            uint32_t nonZeroEnergyY = 0;
5757
0
            uint64_t singleCostY = MAX_INT64;
5758
5759
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5760
5761
0
            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
5762
5763
0
            if (bEnableRDOQ)
5764
0
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5765
5766
0
            fenc = fencYuv->getLumaAddr(absPartIdx);
5767
0
            resi = resiYuv.getLumaAddr(absPartIdx);
5768
0
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
5769
5770
0
            if (numSigTSkipY)
5771
0
            {
5772
0
                m_entropyCoder.resetBits();
5773
0
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
5774
0
                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
5775
0
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
5776
5777
0
                m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
5778
0
                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5779
5780
0
                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
5781
0
                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
5782
0
                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
5783
5784
0
                if (m_rdCost.m_psyRd)
5785
0
                {
5786
0
                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
5787
0
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5788
0
                }
5789
0
                else if(m_rdCost.m_ssimRd)
5790
0
                {
5791
0
                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
5792
0
                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5793
0
                }
5794
0
                else
5795
0
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
5796
0
            }
5797
5798
0
            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
5799
0
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5800
0
            else
5801
0
            {
5802
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5803
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5804
0
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
5805
0
                bestTransformMode[TEXT_LUMA][0] = 1;
5806
0
                if (m_param->limitTU)
5807
0
                    numSig[TEXT_LUMA][0] = numSigTSkipY;
5808
0
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
5809
0
                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
5810
0
                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
5811
0
            }
5812
5813
0
            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5814
0
        }
5815
5816
0
        if (codeChroma && checkTransformSkipC)
5817
0
        {
5818
0
            sse_t nonZeroDistC = 0;
5819
0
            uint32_t nonZeroEnergyC = 0;
5820
0
            uint64_t singleCostC = MAX_INT64;
5821
0
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
5822
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5823
5824
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5825
5826
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5827
0
            {
5828
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5829
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5830
5831
0
                do
5832
0
                {
5833
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5834
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5835
5836
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5837
5838
0
                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5839
5840
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5841
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5842
5843
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5844
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5845
0
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
5846
5847
0
                    m_entropyCoder.resetBits();
5848
0
                    singleBits[chromaId][tuIterator.section] = 0;
5849
5850
0
                    if (numSigTSkipC)
5851
0
                    {
5852
0
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
5853
0
                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5854
0
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
5855
5856
0
                        m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
5857
0
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
5858
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5859
0
                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
5860
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
5861
0
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
5862
0
                        if (m_rdCost.m_psyRd)
5863
0
                        {
5864
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
5865
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5866
0
                        }
5867
0
                        else if(m_rdCost.m_ssimRd)
5868
0
                        {
5869
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5870
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5871
0
                        }
5872
0
                        else
5873
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
5874
0
                    }
5875
5876
0
                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
5877
0
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5878
0
                    else
5879
0
                    {
5880
0
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5881
0
                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5882
0
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
5883
0
                        bestTransformMode[chromaId][tuIterator.section] = 1;
5884
0
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5885
0
                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
5886
0
                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
5887
0
                    }
5888
5889
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5890
0
                }
5891
0
                while (tuIterator.isNextSection());
5892
0
            }
5893
0
        }
5894
5895
        // Here we were encoding cbfs and coefficients, after calculating distortion above.
5896
        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
5897
        // bits required for coefficients and added with number of cbf bits. As I tested the order does not
5898
        // make any difference. But bit confused whether I should load the original context as below.
5899
0
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
5900
0
        m_entropyCoder.resetBits();
5901
5902
        //Encode cbf flags
5903
0
        if (codeChroma)
5904
0
        {
5905
0
            if (!splitIntoSubTUs)
5906
0
            {
5907
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5908
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5909
0
            }
5910
0
            else
5911
0
            {
5912
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5913
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5914
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5915
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
5916
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5917
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
5918
0
            }
5919
0
        }
5920
5921
0
        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
5922
5923
0
        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
5924
5925
0
        uint32_t coeffBits = 0;
5926
0
        coeffBits = singleBits[TEXT_LUMA][0];
5927
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5928
0
        {
5929
0
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
5930
0
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
5931
0
        }
5932
5933
        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
5934
        // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
5935
        // four split block's individual cbf value. This is not known before analysis of four split blocks.
5936
        // For that reason, I am collecting individual coefficient bits only.
5937
0
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
5938
5939
0
        fullCost.distortion += singleDist[TEXT_LUMA][0];
5940
0
        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
5941
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5942
0
        {
5943
0
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
5944
0
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
5945
0
        }
5946
5947
0
        if (m_rdCost.m_psyRd)
5948
0
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5949
0
        else if(m_rdCost.m_ssimRd)
5950
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5951
0
        else
5952
0
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
5953
5954
0
        if (m_param->limitTU && bCheckSplit)
5955
0
        {
5956
            // Stop recursion if the TU's energy level is minimal
5957
0
            uint32_t numCoeff = trSize * trSize;
5958
0
            if (cbfFlag[TEXT_LUMA][0] == 0)
5959
0
                bCheckSplit = false;
5960
0
            else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
5961
0
            {
5962
0
                uint32_t energy = 0;
5963
0
                for (uint32_t i = 0; i < numCoeff; i++)
5964
0
                    energy += abs(coeffCurY[i]);
5965
0
                if (energy == numSig[TEXT_LUMA][0])
5966
0
                    bCheckSplit = false;
5967
0
            }
5968
0
        }
5969
5970
0
        if (bSaveTUData)
5971
0
        {
5972
0
            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5973
0
            {
5974
0
                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5975
0
                {
5976
0
                    m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
5977
0
                    m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
5978
0
                }
5979
0
            }
5980
0
            m_cacheTU.cost[idx] = fullCost;
5981
0
            m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
5982
0
        }
5983
0
    }
5984
0
    if (bLoadTUData)
5985
0
    {
5986
0
        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5987
0
        {
5988
0
            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5989
0
            {
5990
0
                bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
5991
0
                cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
5992
0
            }
5993
0
        }
5994
0
        fullCost = m_cacheTU.cost[idx];
5995
0
        m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
5996
0
        bCheckFull = true;
5997
0
    }
5998
5999
    // code sub-blocks
6000
0
    if (bCheckSplit)
6001
0
    {
6002
0
        if (bCheckFull)
6003
0
        {
6004
0
            m_entropyCoder.store(m_rqt[depth].rqtTest);
6005
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
6006
0
        }
6007
6008
0
        Cost splitCost;
6009
0
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6010
0
        {
6011
            // Subdiv flag can be encoded at the start of analysis of split blocks.
6012
0
            m_entropyCoder.resetBits();
6013
0
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6014
0
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6015
0
        }
6016
6017
0
        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
6018
0
        if (yCbCrCbf || !bCheckFull)
6019
0
        {
6020
0
            if (splitCost.rdcost < fullCost.rdcost)
6021
0
            {
6022
0
                if (m_limitTU & X265_TU_LIMIT_BFS)
6023
0
                {
6024
0
                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
6025
0
                    bool nextSplit = nextlog2TrSize > depthRange[0];
6026
0
                    if (nextSplit)
6027
0
                    {
6028
0
                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
6029
0
                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
6030
0
                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6031
0
                        {
6032
                            // Subdiv flag can be encoded at the start of analysis of split blocks.
6033
0
                            m_entropyCoder.resetBits();
6034
0
                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6035
0
                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6036
0
                        }
6037
0
                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
6038
0
                    }
6039
0
                }
6040
0
                outCosts.distortion += splitCost.distortion;
6041
0
                outCosts.rdcost     += splitCost.rdcost;
6042
0
                outCosts.bits       += splitCost.bits;
6043
0
                outCosts.energy     += splitCost.energy;
6044
0
                return;
6045
0
            }
6046
0
            else
6047
0
                outCosts.energy     += splitCost.energy;
6048
0
        }
6049
6050
0
        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
6051
0
        if (codeChroma)
6052
0
        {
6053
0
            if (!splitIntoSubTUs)
6054
0
            {
6055
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
6056
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
6057
0
            }
6058
0
            else
6059
0
            {
6060
0
                uint32_t tuNumParts = absPartIdxStep >> 1;
6061
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6062
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6063
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6064
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6065
0
            }
6066
0
        }
6067
0
        X265_CHECK(bCheckFull, "check-full must be set\n");
6068
0
        m_entropyCoder.load(m_rqt[depth].rqtTest);
6069
0
    }
6070
6071
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
6072
0
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
6073
6074
0
    if (codeChroma)
6075
0
    {
6076
0
        if (!splitIntoSubTUs)
6077
0
        {
6078
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
6079
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
6080
0
        }
6081
0
        else
6082
0
        {
6083
0
            uint32_t tuNumParts = absPartIdxStep >> 1;
6084
6085
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
6086
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
6087
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6088
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6089
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6090
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6091
0
        }
6092
0
    }
6093
6094
0
    outCosts.distortion += fullCost.distortion;
6095
0
    outCosts.rdcost     += fullCost.rdcost;
6096
0
    outCosts.bits       += fullCost.bits;
6097
0
    outCosts.energy     += fullCost.energy;
6098
0
}
6099
6100
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
6101
0
{
6102
0
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
6103
6104
0
    const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
6105
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
6106
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
6107
0
    {
6108
0
        if (!(log2TrSize - m_hChromaShift < 2))
6109
0
        {
6110
0
            uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
6111
0
            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
6112
0
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
6113
0
            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
6114
0
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
6115
0
        }
6116
0
    }
6117
6118
0
    if (!bSubdiv)
6119
0
    {
6120
0
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
6121
0
    }
6122
0
    else
6123
0
    {
6124
0
        uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
6125
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
6126
0
            codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange);
6127
0
    }
6128
0
}
6129
6130
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
6131
0
{
6132
0
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
6133
6134
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
6135
0
    {
6136
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
6137
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
6138
0
            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
6139
0
        return;
6140
0
    }
6141
6142
0
    const uint32_t qtLayer = log2TrSize - 2;
6143
6144
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
6145
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
6146
0
    if (log2TrSizeC < 2)
6147
0
    {
6148
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
6149
0
        log2TrSizeC = 2;
6150
0
        codeChroma &= !(absPartIdx & 3);
6151
0
    }
6152
6153
0
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
6154
6155
0
    uint32_t numCoeffY = 1 << (log2TrSize * 2);
6156
0
    uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
6157
0
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
6158
0
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
6159
0
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
6160
6161
0
    if (codeChroma)
6162
0
    {
6163
0
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
6164
6165
0
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
6166
0
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
6167
6168
0
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
6169
0
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
6170
0
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
6171
0
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
6172
0
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
6173
0
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
6174
0
    }
6175
0
}
6176
6177
/* returns the number of bits required to signal a non-most-probable mode.
6178
 * on return mpms contains bitmap of most probable modes */
6179
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
6180
1.57M
{
6181
1.57M
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
6182
6183
1.57M
    mpms = 0;
6184
6.30M
    for (int i = 0; i < 3; ++i)
6185
4.73M
        mpms |= ((uint64_t)1 << mpmModes[i]);
6186
6187
1.57M
    return m_entropyCoder.bitsIntraModeNonMPM();
6188
1.57M
}
6189
6190
/* swap the current mode/cost with the mode with the highest cost in the
6191
 * current candidate list, if its cost is better (maintain a top N list) */
6192
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
6193
1.66M
{
6194
1.66M
    uint32_t maxIndex = 0;
6195
1.66M
    uint64_t maxValue = 0;
6196
6197
14.3M
    for (int i = 0; i < maxCandCount; i++)
6198
12.6M
    {
6199
12.6M
        if (maxValue < candCostList[i])
6200
1.80M
        {
6201
1.80M
            maxValue = candCostList[i];
6202
1.80M
            maxIndex = i;
6203
1.80M
        }
6204
12.6M
    }
6205
6206
1.66M
    if (cost < maxValue)
6207
1.60M
    {
6208
1.60M
        candCostList[maxIndex] = cost;
6209
1.60M
        candModeList[maxIndex] = mode;
6210
1.60M
    }
6211
1.66M
}
6212
6213
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
6214
683k
{
6215
683k
    CUData& cu = mode.cu;
6216
683k
    if (cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth)
6217
16.4k
    {
6218
16.4k
        if (cu.getQtRootCbf(0))
6219
735
        {
6220
735
            if (m_param->rdLevel >= 3)
6221
735
            {
6222
735
                mode.contexts.resetBits();
6223
735
                mode.contexts.codeDeltaQP(cu, 0);
6224
735
                uint32_t bits = mode.contexts.getNumberOfWrittenBits();
6225
735
                mode.totalBits += bits;
6226
735
                updateModeCost(mode);
6227
735
            }
6228
0
            else if (m_param->rdLevel <= 1)
6229
0
            {
6230
0
                mode.sa8dBits++;
6231
0
                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
6232
0
            }
6233
0
            else
6234
0
            {
6235
0
                mode.totalBits++;
6236
0
                updateModeCost(mode);
6237
0
            }
6238
735
        }
6239
15.7k
        else
6240
15.7k
            cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
6241
16.4k
    }
6242
683k
}
6243
6244
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
6245
102k
{
6246
102k
    CUData& cu = mode.cu;
6247
6248
102k
    if ((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP)
6249
19.7k
    {
6250
19.7k
        bool hasResidual = false;
6251
6252
        /* Check if any sub-CU has a non-zero QP */
6253
948k
        for (uint32_t blkIdx = 0; blkIdx < cuGeom.numPartitions; blkIdx++)
6254
928k
        {
6255
928k
            if (cu.getQtRootCbf(blkIdx))
6256
489
            {
6257
489
                hasResidual = true;
6258
489
                break;
6259
489
            }
6260
928k
        }
6261
19.7k
        if (hasResidual)
6262
489
        {
6263
489
            if (m_param->rdLevel >= 3)
6264
489
            {
6265
489
                mode.contexts.resetBits();
6266
489
                mode.contexts.codeDeltaQP(cu, 0);
6267
489
                uint32_t bits = mode.contexts.getNumberOfWrittenBits();
6268
489
                mode.totalBits += bits;
6269
489
                updateModeCost(mode);
6270
489
            }
6271
0
            else if (m_param->rdLevel <= 1)
6272
0
            {
6273
0
                mode.sa8dBits++;
6274
0
                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
6275
0
            }
6276
0
            else
6277
0
            {
6278
0
                mode.totalBits++;
6279
0
                updateModeCost(mode);
6280
0
            }
6281
            /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
6282
            When the non-zero CBF sub-CU is found, stop */
6283
489
            cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
6284
489
        }
6285
19.2k
        else
6286
            /* No residual within this CU or subCU, so reset QP to RefQP */
6287
19.2k
            cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
6288
19.7k
    }
6289
102k
}