Coverage Report

Created: 2026-05-16 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/search.cpp
Line
Count
Source
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Steve Borho <steve@borho.org>
5
*          Min Chen <chenm003@163.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 2 of the License, or
10
* (at your option) any later version.
11
*
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
* GNU General Public License for more details.
16
*
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
*
21
* This program is also available under a commercial proprietary license.
22
* For more information, contact us at license @ x265.com.
23
*****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "picyuv.h"
28
#include "cudata.h"
29
30
#include "search.h"
31
#include "entropy.h"
32
#include "rdcost.h"
33
34
#include "analysis.h"  // TLD
35
#include "framedata.h"
36
#include "encoder.h"
37
38
using namespace X265_NS;
39
40
#if _MSC_VER
41
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
42
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
43
#pragma warning(disable: 4127) // conditional expression is constant
44
#endif
45
46
0
#define MVP_IDX_BITS 1
47
48
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
49
50
Search::Search()
51
19.6k
{
52
19.6k
    memset(m_rqt, 0, sizeof(m_rqt));
53
54
78.7k
    for (int i = 0; i < 3; i++)
55
59.0k
    {
56
59.0k
        m_qtTempTransformSkipFlag[i] = NULL;
57
59.0k
        m_qtTempCbf[i] = NULL;
58
59.0k
    }
59
60
19.6k
    m_numLayers = 0;
61
19.6k
    m_intraPred = NULL;
62
19.6k
    m_intraPredAngs = NULL;
63
19.6k
    m_fencScaled = NULL;
64
19.6k
    m_fencTransposed = NULL;
65
19.6k
    m_tsCoeff = NULL;
66
19.6k
    m_tsResidual = NULL;
67
19.6k
    m_tsRecon = NULL;
68
19.6k
    m_param = NULL;
69
19.6k
    m_slice = NULL;
70
19.6k
    m_frame = NULL;
71
19.6k
    m_maxTUDepth = -1;
72
19.6k
}
73
74
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
75
19.6k
{
76
19.6k
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
77
19.6k
    m_param = &param;
78
19.6k
    m_bFrameParallel = param.frameNumThreads > 1;
79
19.6k
    m_numLayers = g_log2Size[param.maxCUSize] - 2;
80
#if ENABLE_SCC_EXT
81
    m_ibcEnabled = param.bEnableSCC;
82
#endif
83
84
19.6k
    m_rdCost.setPsyRdScale(param.psyRd);
85
19.6k
    m_rdCost.setSsimRd(param.bSsimRd);
86
19.6k
    m_me.init(param.internalCsp);
87
88
19.6k
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
89
19.6k
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
90
0
        ok &= m_quant.allocNoiseReduction(param);
91
92
19.6k
    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
93
94
    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
95
     * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
96
19.6k
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
97
98
19.6k
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
99
19.6k
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
100
19.6k
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
101
102
19.6k
    m_limitTU = 0;
103
19.6k
    if (m_param->limitTU)
104
0
    {
105
0
        if (m_param->limitTU == 1)
106
0
            m_limitTU = X265_TU_LIMIT_BFS;
107
0
        else if (m_param->limitTU == 2)
108
0
            m_limitTU = X265_TU_LIMIT_DFS;
109
0
        else if (m_param->limitTU == 3)
110
0
            m_limitTU = X265_TU_LIMIT_NEIGH;
111
0
        else if (m_param->limitTU == 4)
112
0
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
113
0
    }
114
115
    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
116
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
117
     * which are reconstructed at each depth are valid. At the end, the transform depth table
118
     * is walked and the coeff and recon at the correct depths are collected */
119
120
19.6k
    if (param.internalCsp != X265_CSP_I400)
121
19.6k
    {
122
106k
        for (uint32_t i = 0; i <= m_numLayers; i++)
123
86.9k
        {
124
86.9k
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
125
86.9k
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
126
86.9k
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
127
86.9k
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
128
86.9k
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
129
86.9k
        }
130
19.6k
    }
131
0
    else
132
0
    {
133
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
134
0
        {
135
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
136
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
137
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
138
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
139
0
        }
140
0
    }
141
142
    /* the rest of these buffers are indexed per-depth */
143
86.9k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
144
67.2k
    {
145
67.2k
        int cuSize = param.maxCUSize >> i;
146
67.2k
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
147
67.2k
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
148
67.2k
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
149
67.2k
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
150
67.2k
    }
151
152
19.6k
    if (param.internalCsp != X265_CSP_I400)
153
19.6k
    {
154
19.6k
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
155
19.6k
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
156
19.6k
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
157
19.6k
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
158
19.6k
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
159
19.6k
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
160
19.6k
    }
161
0
    else
162
0
    {
163
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
164
0
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
165
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
166
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
167
0
    }
168
169
19.6k
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
170
19.6k
    m_fencScaled = m_intraPred + 32 * 32;
171
19.6k
    m_fencTransposed = m_fencScaled + 32 * 32;
172
19.6k
    m_intraPredAngs = m_fencTransposed + 32 * 32;
173
174
19.6k
    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
175
19.6k
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
176
19.6k
    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
177
178
#if ENABLE_SCC_EXT
179
    m_numBVs = 0;
180
    m_numBV16s = 0;
181
#endif
182
183
19.6k
    return ok;
184
185
0
fail:
186
0
    return false;
187
19.6k
}
188
189
Search::~Search()
190
19.6k
{
191
106k
    for (uint32_t i = 0; i <= m_numLayers; i++)
192
86.9k
    {
193
86.9k
        X265_FREE(m_rqt[i].coeffRQT[0]);
194
86.9k
        m_rqt[i].reconQtYuv.destroy();
195
86.9k
        m_rqt[i].resiQtYuv.destroy();
196
86.9k
    }
197
198
86.9k
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
199
67.2k
    {
200
67.2k
        m_rqt[i].tmpResiYuv.destroy();
201
67.2k
        m_rqt[i].tmpPredYuv.destroy();
202
67.2k
        m_rqt[i].bidirPredYuv[0].destroy();
203
67.2k
        m_rqt[i].bidirPredYuv[1].destroy();
204
67.2k
    }
205
206
19.6k
    X265_FREE(m_qtTempCbf[0]);
207
19.6k
    X265_FREE(m_qtTempTransformSkipFlag[0]);
208
19.6k
    X265_FREE(m_intraPred);
209
19.6k
    X265_FREE(m_tsCoeff);
210
19.6k
    X265_FREE(m_tsResidual);
211
19.6k
    X265_FREE(m_tsRecon);
212
19.6k
}
213
214
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
215
24.6k
{
216
24.6k
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
217
218
24.6k
    m_me.setQP(qp);
219
24.6k
    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
220
221
24.6k
    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
222
24.6k
    m_quant.setQPforQuant(ctu, quantQP);
223
24.6k
    return quantQP;
224
24.6k
}
225
226
void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData& cu, PicYuv* fencPic, int puOffset, PartSize part, int areaIdx, int finalIdx, bool isMVP , const int* neighborIdx)
227
0
{
228
#ifdef DETAILED_CU_STATS
229
    m_stats[cu.m_encData->m_frameEncoderID].countMotionEstimate++;
230
#endif
231
232
0
    int satdCost = 0;
233
0
    int numPredDir = slice->isInterP() ? 1 : 2;
234
0
    int searchRange = isMVP ? 32 : m_param->searchRange;
235
236
0
    MV mvp(0,0);
237
0
    MV mvzero(0,0);
238
239
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
240
0
    MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
241
242
0
    MotionData bestME[2];
243
0
    bestME[0].cost = MAX_UINT;
244
0
    bestME[1].cost = MAX_UINT;
245
246
0
    int numPart = cu.getNumPartInter(0);
247
0
    uint32_t lastMode = 0;
248
249
0
    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
250
0
    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
251
252
0
    int numMvc = 0;
253
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
254
0
    {
255
0
        PredictionUnit pu(cu, cuGeom, puIdx);
256
257
0
        int pos = finalIdx + puIdx * puOffset;
258
0
        int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
259
260
0
        InterNeighbourMV neighbours[6];
261
0
        if(!isMVP)
262
0
           cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, neighbours);
263
264
0
        for (int list = 0; list < numPredDir; list++)
265
0
        {
266
0
            int numIdx = slice->m_numRefIdx[list];
267
0
            for (int ref = 0; ref < numIdx; ref++)
268
0
            {
269
0
                getBlkBits(part, slice->isInterP(), puIdx, lastMode, m_listSelBits);
270
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
271
0
                bits += getTUBits(ref, numIdx);
272
273
0
                MV mvmin, mvmax, outmv,mvp_lowres;;
274
0
                mvp = !isMVP ? m_areaBestMV[areaIdx][list][ref] : mvp;
275
276
0
                MV zeroMV[2] = {0,0};
277
0
                const MV* amvp = zeroMV;
278
0
                int mvpIdx = 0;
279
280
0
                bool bLowresMVP = false;
281
0
                if (!isMVP)
282
0
                {
283
0
                    for(int dir = MD_LEFT; dir <= MD_ABOVE_LEFT ; dir++)
284
0
                    {
285
0
                        int neighIdx = neighborIdx[dir];
286
0
                        if (neighIdx >= 0)
287
0
                        {
288
0
                            MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
289
0
                            for (int i = 0; i < 2; i++)
290
0
                            {
291
0
                                neighbours[dir].mv[i] = neighborData.mv[i];
292
0
                                neighbours[dir].refIdx[i] = neighborData.ref[i];
293
0
                            }
294
0
                            neighbours[dir].isAvailable = (neighborData.ref[0] >= 0 || neighborData.ref[1] >= 0);
295
0
                        }
296
0
                        else
297
0
                        {
298
0
                            for (int i = 0; i < 2; i++)
299
0
                                neighbours[dir].refIdx[i] = -1;
300
0
                            neighbours[dir].isAvailable = false;
301
0
                        }
302
0
                    }
303
304
0
                    numMvc = cu.getPMV(neighbours, list, ref, amvpCand[list][ref], mvc);
305
0
                    if (numMvc > 0)
306
0
                    {
307
0
                        amvp = amvpCand[list][ref];
308
0
                        mvpIdx = selectMVP(cu, pu, amvp, list, ref);
309
0
                        mvp = amvp[mvpIdx];                 
310
0
                    }
311
0
                    else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
312
0
                    {
313
0
                        MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
314
315
0
                        bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
316
0
                        bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
317
0
                        bool uniL1 = (meData.ref[1] >= 0 && meData.ref[0] == REF_NOT_VALID);
318
319
0
                        if (uniL0)
320
0
                            mvp = meData.mv[0];
321
0
                        else if (uniL1)
322
0
                            mvp = meData.mv[1];
323
0
                        else if (bi)
324
0
                            mvp = meData.mv[list];
325
0
                    }
326
0
                }
327
328
0
                m_me.setMVP(mvp);
329
330
0
                if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
331
0
                {
332
0
                    uint32_t blockX = cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + (pu.width  >> 1);
333
0
                    uint32_t blockY = cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + (pu.height >> 1);
334
335
0
                    if (blockX < m_slice->m_sps->picWidthInLumaSamples && blockY < m_slice->m_sps->picHeightInLumaSamples)
336
0
                    {
337
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
338
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
339
0
                        if (lmv.notZero() && !layer)
340
0
                        {
341
0
                            mvc[numMvc++] = lmv;
342
0
                            bLowresMVP = true;
343
0
                        }
344
0
                        mvp_lowres = lmv;
345
0
                    }
346
0
                }
347
348
0
                PicYuv* recon = slice->m_mref[list][ref].reconPic;
349
0
                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0);
350
351
0
                if (m_param->searchMethod == X265_SEA)
352
0
                {
353
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
354
0
                        m_me.integral[planes] = slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset;
355
0
                }
356
357
0
                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
358
0
                setSearchRange(cu, mvp, searchRange, mvmin, mvmax);
359
360
0
                if (isMVP)
361
0
                {
362
0
                    satdCost = m_me.diamondSearch(&slice->m_mref[list][ref], mvmin, mvmax, outmv);
363
0
                    m_areaBestMV[areaIdx][list][ref] = outmv;
364
0
                }
365
0
                else
366
0
                {
367
0
                    m_vertRestriction = slice->m_refPOCList[list][ref] == slice->m_poc;
368
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
369
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
370
371
0
                    if (bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp)
372
0
                    {
373
0
                        MV outmv_lowres;
374
0
                        bLowresMVP = false;
375
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
376
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref],  mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange,outmv_lowres, m_param->maxSlices,
377
0
                            m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0): 0);
378
379
0
                        if (lowresMvCost < satdCost)
380
0
                        {
381
0
                            outmv = outmv_lowres;
382
0
                            satdCost = lowresMvCost;
383
0
                            bLowresMVP = true;
384
0
                        }
385
0
                    }
386
0
                }
387
388
0
                bits += m_me.bitcost(outmv);
389
0
                uint32_t mvCost = m_me.mvcost(outmv);
390
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
391
392
0
                if(!isMVP)
393
0
                {
394
0
                    if (bLowresMVP)
395
0
                        updateMVP(mvp, outmv, bits, cost, mvp_lowres);
396
397
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
398
0
                }
399
0
                if (cost < bestME[list].cost)
400
0
                {
401
0
                    bestME[list].mv = outmv;
402
0
                    bestME[list].mvp = mvp;
403
0
                    bestME[list].mvpIdx = 0;
404
0
                    bestME[list].cost = cost;
405
0
                    bestME[list].bits = bits;
406
0
                    bestME[list].mvCost = mvCost;
407
0
                    bestME[list].ref = ref;
408
0
                }
409
0
            }
410
0
        }
411
412
0
        if (isMVP)
413
0
            return;
414
415
        //Bi-Direction
416
0
        MotionData bidir[2];
417
0
        uint32_t bidirCost = MAX_UINT;
418
0
        int bidirBits = 0;
419
0
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
420
421
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&
422
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !isMVP)
423
0
        {
424
0
            bidir[0] = bestME[0];
425
0
            bidir[1] = bestME[1];
426
427
0
            if (m_me.bChromaSATD)
428
0
            {
429
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
430
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
431
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
432
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
433
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
434
435
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
436
0
                    m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
437
0
            }
438
0
            else
439
0
            {
440
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
441
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
442
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
443
444
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
445
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
446
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
447
0
                    bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
448
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
449
0
            }
450
451
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
452
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
453
454
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
455
0
            if (bTryZero)
456
0
            {
457
0
                MV mvmin, mvmax;
458
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
459
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
460
0
                mvmax.y += 2;
461
0
                mvmin <<= 2;
462
0
                mvmax <<= 2;
463
464
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
465
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
466
0
            }
467
0
            if (bTryZero)
468
0
            {
469
0
                if (m_me.bChromaSATD)
470
0
                {
471
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
472
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
473
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
474
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
475
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
476
477
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
478
0
                        m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
479
0
                }
480
0
                else
481
0
                {
482
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
483
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
484
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
485
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
486
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
487
0
                }
488
489
0
                MV mvp0 = bestME[0].mvp;
490
0
                int mvpIdx0 = bestME[0].mvpIdx;
491
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
492
493
0
                MV mvp1 = bestME[1].mvp;
494
0
                int mvpIdx1 = bestME[1].mvpIdx;
495
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
496
497
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
498
499
0
                if (cost < bidirCost)
500
0
                {
501
0
                    bidir[0].mv = mvzero;
502
0
                    bidir[1].mv = mvzero;
503
0
                    bidir[0].mvp = mvp0;
504
0
                    bidir[1].mvp = mvp1;
505
0
                    bidir[0].mvpIdx = mvpIdx0;
506
0
                    bidir[1].mvpIdx = mvpIdx1;
507
0
                    bidirCost = cost;
508
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
509
0
                }
510
0
            }
511
0
        }
512
0
        MEData& outME = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
513
514
0
        outME.ref[0] = REF_NOT_VALID;
515
0
        outME.ref[1] = REF_NOT_VALID;
516
517
0
        if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
518
0
        {
519
0
            lastMode = 2;
520
521
0
            outME.mv[0] = bidir[0].mv;
522
0
            outME.mv[1] = bidir[1].mv;
523
0
            outME.mvp[0] = bidir[0].mvp;
524
0
            outME.mvp[1] = bidir[1].mvp;
525
0
            outME.mvCost[0] = bestME[0].mvCost;
526
0
            outME.mvCost[1] = bestME[1].mvCost;
527
0
            outME.ref[0] = bestME[0].ref;
528
0
            outME.ref[1] = bestME[1].ref;
529
530
0
            outME.bits = bidirBits;
531
0
            outME.cost = bidirCost;
532
0
        }
533
0
        else if (bestME[0].cost <= bestME[1].cost)
534
0
        {
535
0
            lastMode = 0;
536
537
0
            outME.mv[0] = bestME[0].mv;
538
0
            outME.mvp[0] = bestME[0].mvp;
539
0
            outME.mvCost[0] = bestME[0].mvCost;
540
0
            outME.cost = bestME[0].cost;
541
0
            outME.bits = bestME[0].bits;
542
0
            outME.ref[0] = bestME[0].ref;
543
0
            outME.ref[1] = REF_NOT_VALID;
544
0
        }
545
0
        else
546
0
        {
547
0
            lastMode = 1;
548
549
0
            outME.mv[1] = bestME[1].mv;
550
0
            outME.mvp[1] = bestME[1].mvp;
551
0
            outME.mvCost[1] = bestME[1].mvCost;
552
0
            outME.cost = bestME[1].cost;
553
0
            outME.bits = bestME[1].bits;
554
0
            outME.ref[1] = bestME[1].ref;
555
0
            outME.ref[0] = REF_NOT_VALID;
556
0
        }
557
0
    }
558
0
}
559
560
#if CHECKED_BUILD || _DEBUG
561
void Search::invalidateContexts(int fromDepth)
562
{
563
    /* catch reads without previous writes */
564
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
565
    {
566
        m_rqt[d].cur.markInvalid();
567
        m_rqt[d].rqtTemp.markInvalid();
568
        m_rqt[d].rqtRoot.markInvalid();
569
        m_rqt[d].rqtTest.markInvalid();
570
    }
571
}
572
#else
573
91.5k
void Search::invalidateContexts(int) {}
574
#endif
575
576
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
577
8.44M
{
578
8.44M
    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
579
8.44M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
580
581
8.44M
    if (!(log2TrSize - m_hChromaShift < 2))
582
3.07M
    {
583
3.07M
        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
584
3.07M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
585
3.07M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
586
3.07M
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
587
3.07M
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
588
3.07M
    }
589
590
8.44M
    if (subdiv)
591
1.34M
    {
592
1.34M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
593
6.71M
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
594
5.37M
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
595
1.34M
    }
596
8.44M
}
597
598
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
599
6.18M
{
600
6.18M
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
601
6.12M
        return;
602
603
59.4k
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
604
605
59.4k
    if (tuDepth < cu.m_tuDepth[absPartIdx])
606
10.1k
    {
607
10.1k
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
608
50.9k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
609
40.7k
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
610
611
10.1k
        return;
612
10.1k
    }
613
614
49.2k
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
615
616
49.2k
    if (log2TrSizeC < 2)
617
30.0k
    {
618
30.0k
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
619
30.0k
        if (absPartIdx & 3)
620
22.5k
            return;
621
7.51k
        log2TrSizeC = 2;
622
7.51k
    }
623
624
26.7k
    uint32_t qtLayer = log2TrSize - 2;
625
626
26.7k
    if (m_csp != X265_CSP_I422)
627
23.5k
    {
628
23.5k
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
629
23.5k
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
630
23.5k
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
631
23.5k
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
632
23.5k
    }
633
3.16k
    else
634
3.16k
    {
635
3.16k
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
636
3.16k
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
637
3.16k
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
638
3.16k
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
639
3.16k
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
640
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
641
3.16k
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
642
0
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
643
3.16k
    }
644
26.7k
}
645
646
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
647
4.24M
{
648
4.24M
    CUData& cu = mode.cu;
649
4.24M
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
650
4.24M
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
651
4.24M
    uint32_t qtLayer    = log2TrSize - 2;
652
4.24M
    uint32_t sizeIdx    = log2TrSize - 2;
653
4.24M
    bool mightNotSplit  = log2TrSize <= depthRange[1];
654
4.24M
    bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
655
4.24M
    bool bEnableRDOQ  = !!m_param->rdoqLevel;
656
657
    /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
658
4.24M
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
659
0
    {
660
0
        mightNotSplit = false;
661
0
        mightSplit = true;
662
0
    }
663
664
4.24M
    Cost fullCost;
665
4.24M
    uint32_t bCBF = 0;
666
667
4.24M
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
668
4.24M
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
669
670
4.24M
    if (mightNotSplit)
671
4.23M
    {
672
4.23M
        if (mightSplit)
673
346k
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
674
675
4.23M
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
676
4.23M
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
677
4.23M
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
678
4.23M
        uint32_t stride   = mode.fencYuv->m_size;
679
680
        // init availability pattern
681
4.23M
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
682
4.23M
        IntraNeighbors intraNeighbors;
683
4.23M
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
684
4.23M
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
685
686
        // get prediction signal
687
4.23M
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
688
689
4.23M
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
690
4.23M
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
691
692
4.23M
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
693
4.23M
        coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
694
695
        // store original entropy coding status
696
4.23M
        if (bEnableRDOQ)
697
4.24M
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
698
4.23M
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
699
700
4.23M
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
701
4.23M
        if (numSig)
702
22.3k
        {
703
22.3k
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
704
22.3k
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
705
22.3k
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
706
22.3k
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
707
22.3k
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
708
22.3k
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
709
22.3k
        }
710
4.21M
        else
711
            // no coded residual, recon = pred
712
4.21M
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
713
714
4.23M
        bCBF = !!numSig << tuDepth;
715
4.23M
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
716
4.23M
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
717
718
4.23M
        m_entropyCoder.resetBits();
719
4.23M
        if (!absPartIdx)
720
1.59M
        {
721
1.59M
            if (!cu.m_slice->isIntra())
722
0
            {
723
0
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
724
0
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
725
0
                m_entropyCoder.codeSkipFlag(cu, 0);
726
0
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
727
0
            }
728
729
1.59M
            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
730
1.59M
        }
731
4.23M
        if (cu.m_partSize[0] == SIZE_2Nx2N)
732
2.08M
        {
733
2.08M
            if (!absPartIdx)
734
1.04M
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
735
2.08M
        }
736
2.15M
        else
737
2.15M
        {
738
2.15M
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
739
2.15M
            if (!tuDepth)
740
0
            {
741
0
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
742
0
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
743
0
            }
744
2.15M
            else if (!(absPartIdx & (qNumParts - 1)))
745
2.15M
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
746
2.15M
        }
747
4.23M
        if (log2TrSize != depthRange[0])
748
703k
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
749
750
4.23M
        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
751
752
4.23M
        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
753
22.3k
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
754
755
4.23M
        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
756
757
4.23M
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
758
0
            fullCost.bits *= 4;
759
760
4.23M
        if (m_rdCost.m_psyRd)
761
4.23M
        {
762
4.23M
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
763
4.23M
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
764
4.23M
        }
765
18.4E
        else if(m_rdCost.m_ssimRd)
766
0
        {
767
0
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
768
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
769
0
        }
770
18.4E
        else
771
18.4E
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
772
4.23M
    }
773
1.56k
    else
774
1.56k
        fullCost.rdcost = MAX_INT64;
775
776
4.24M
    if (mightSplit)
777
346k
    {
778
346k
        if (mightNotSplit)
779
346k
        {
780
346k
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
781
346k
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
782
346k
        }
783
784
        /* code split block */
785
346k
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
786
787
346k
        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
788
346k
        if (m_param->bEnableTSkipFast)
789
0
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
790
791
346k
        Cost splitCost;
792
346k
        uint32_t cbf = 0;
793
1.73M
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
794
1.38M
        {
795
1.38M
            if (checkTransformSkip)
796
0
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
797
1.38M
            else
798
1.38M
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
799
800
1.38M
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
801
1.38M
        }
802
346k
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
803
804
346k
        if (mightNotSplit && log2TrSize != depthRange[0])
805
346k
        {
806
            /* If we could have coded this TU depth, include cost of subdiv flag */
807
346k
            m_entropyCoder.resetBits();
808
346k
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
809
346k
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
810
811
346k
            if (m_rdCost.m_psyRd)
812
346k
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
813
6
            else if(m_rdCost.m_ssimRd)
814
0
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
815
6
            else
816
6
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
817
346k
        }
818
819
346k
        if (splitCost.rdcost < fullCost.rdcost)
820
410
        {
821
410
            outCost.rdcost     += splitCost.rdcost;
822
410
            outCost.distortion += splitCost.distortion;
823
410
            outCost.bits       += splitCost.bits;
824
410
            outCost.energy     += splitCost.energy;
825
410
            return;
826
410
        }
827
345k
        else
828
345k
        {
829
            // recover entropy state of full-size TU encode
830
345k
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
831
832
            // recover transform index and Cbf values
833
345k
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
834
345k
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
835
345k
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
836
345k
        }
837
346k
    }
838
839
    // set reconstruction for next intra prediction blocks if full TU prediction won
840
4.24M
    PicYuv*  reconPic = m_frame->m_reconPic[0];
841
4.24M
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
842
4.24M
    intptr_t picStride = reconPic->m_stride;
843
4.24M
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
844
845
4.24M
    outCost.rdcost     += fullCost.rdcost;
846
4.24M
    outCost.distortion += fullCost.distortion;
847
4.24M
    outCost.bits       += fullCost.bits;
848
4.24M
    outCost.energy     += fullCost.energy;
849
4.24M
}
850
851
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
852
0
{
853
0
    uint32_t fullDepth = cuGeom.depth + tuDepth;
854
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
855
0
    uint32_t tuSize = 1 << log2TrSize;
856
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
857
858
0
    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
859
860
0
    CUData& cu = mode.cu;
861
0
    Yuv* predYuv = &mode.predYuv;
862
0
    const Yuv* fencYuv = mode.fencYuv;
863
864
0
    Cost fullCost;
865
0
    fullCost.rdcost = MAX_INT64;
866
0
    int      bTSkip = 0;
867
0
    uint32_t bCBF = 0;
868
869
0
    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
870
0
    pixel*   pred = predYuv->getLumaAddr(absPartIdx);
871
0
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
872
0
    uint32_t stride = fencYuv->m_size;
873
0
    uint32_t sizeIdx = log2TrSize - 2;
874
875
    // init availability pattern
876
0
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
877
0
    IntraNeighbors intraNeighbors;
878
0
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
879
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
880
881
    // get prediction signal
882
0
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
883
884
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
885
886
0
    uint32_t qtLayer = log2TrSize - 2;
887
0
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
888
0
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
889
0
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
890
0
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
891
892
    // store original entropy coding status
893
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
894
895
0
    if (bEnableRDOQ)
896
0
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
897
898
0
    int checkTransformSkip = 1;
899
0
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
900
0
    {
901
0
        uint64_t tmpCost;
902
0
        uint32_t tmpEnergy = 0;
903
904
0
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
905
0
        pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
906
0
        bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
907
0
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
908
909
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
910
911
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
912
0
        if (numSig)
913
0
        {
914
0
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
915
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
916
0
            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
917
0
            bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
918
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
919
0
        }
920
0
        else if (useTSkip)
921
0
        {
922
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
923
0
            checkTransformSkip = 0;
924
0
            break;
925
0
        }
926
0
        else
927
            // no residual coded, recon = pred
928
0
            primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
929
930
0
        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
931
932
0
        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
933
0
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
934
935
0
        if (useTSkip)
936
0
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
937
938
0
        m_entropyCoder.resetBits();
939
0
        if (!absPartIdx)
940
0
        {
941
0
            if (!cu.m_slice->isIntra())
942
0
            {
943
0
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
944
0
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
945
0
                m_entropyCoder.codeSkipFlag(cu, 0);
946
0
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
947
0
            }
948
949
0
            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
950
0
        }
951
0
        if (cu.m_partSize[0] == SIZE_2Nx2N)
952
0
        {
953
0
            if (!absPartIdx)
954
0
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
955
0
        }
956
0
        else
957
0
        {
958
0
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
959
0
            if (!tuDepth)
960
0
            {
961
0
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
962
0
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
963
0
            }
964
0
            else if (!(absPartIdx & (qNumParts - 1)))
965
0
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
966
0
        }
967
0
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
968
969
0
        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
970
971
0
        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
972
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
973
974
0
        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
975
976
0
        if (!useTSkip)
977
0
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
978
979
0
        if (m_rdCost.m_psyRd)
980
0
        {
981
0
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
982
0
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
983
0
        }
984
0
        else if(m_rdCost.m_ssimRd)
985
0
        {
986
0
            tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
987
0
            tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
988
0
        }
989
0
        else
990
0
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
991
992
0
        if (tmpCost < fullCost.rdcost)
993
0
        {
994
0
            bTSkip = useTSkip;
995
0
            bCBF = !!numSig;
996
0
            fullCost.rdcost = tmpCost;
997
0
            fullCost.distortion = tmpDist;
998
0
            fullCost.bits = tmpBits;
999
0
            fullCost.energy = tmpEnergy;
1000
0
        }
1001
0
    }
1002
1003
0
    if (bTSkip)
1004
0
    {
1005
0
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
1006
0
        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
1007
0
    }
1008
0
    else if (checkTransformSkip)
1009
0
    {
1010
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
1011
0
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
1012
0
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
1013
0
    }
1014
1015
    // set reconstruction for next intra prediction blocks
1016
0
    PicYuv*  reconPic = m_frame->m_reconPic[0];
1017
0
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
1018
0
    intptr_t picStride = reconPic->m_stride;
1019
0
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
1020
1021
0
    outCost.rdcost += fullCost.rdcost;
1022
0
    outCost.distortion += fullCost.distortion;
1023
0
    outCost.bits += fullCost.bits;
1024
0
    outCost.energy += fullCost.energy;
1025
0
}
1026
1027
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
1028
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
1029
0
{
1030
0
    CUData& cu = mode.cu;
1031
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
1032
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1033
0
    bool     bCheckFull = log2TrSize <= depthRange[1];
1034
1035
0
    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
1036
1037
    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
1038
     * since we are not measuring RD cost */
1039
0
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
1040
0
        bCheckFull = false;
1041
1042
0
    if (bCheckFull)
1043
0
    {
1044
0
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
1045
0
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
1046
0
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
1047
0
        uint32_t stride   = mode.fencYuv->m_size;
1048
1049
        // init availability pattern
1050
0
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
1051
0
        IntraNeighbors intraNeighbors;
1052
0
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
1053
0
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
1054
1055
        // get prediction signal
1056
0
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
1057
1058
0
        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
1059
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
1060
1061
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
1062
0
        coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
1063
1064
0
        uint32_t sizeIdx   = log2TrSize - 2;
1065
0
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1066
1067
0
        PicYuv*  reconPic = m_frame->m_reconPic[0];
1068
0
        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
1069
0
        intptr_t picStride = reconPic->m_stride;
1070
1071
0
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
1072
0
        if (numSig)
1073
0
        {
1074
0
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
1075
0
            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
1076
0
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
1077
0
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
1078
0
            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
1079
0
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
1080
0
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
1081
0
        }
1082
0
        else
1083
0
        {
1084
0
            primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
1085
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
1086
0
        }
1087
0
    }
1088
0
    else
1089
0
    {
1090
0
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
1091
1092
        /* code split block */
1093
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1094
0
        uint32_t cbf = 0;
1095
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1096
0
        {
1097
0
            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
1098
0
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
1099
0
        }
1100
0
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
1101
0
    }
1102
0
}
1103
1104
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
1105
1.42M
{
1106
1.42M
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1107
1108
1.42M
    if (tuDepth == cu.m_tuDepth[absPartIdx])
1109
1.42M
    {
1110
1.42M
        uint32_t qtLayer    = log2TrSize - 2;
1111
1112
        // copy transform coefficients
1113
1.42M
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
1114
1.42M
        coeff_t* coeffSrcY    = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
1115
1.42M
        coeff_t* coeffDestY   = cu.m_trCoeff[0]            + coeffOffsetY;
1116
1.42M
        memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
1117
1118
        // copy reconstruction
1119
1.42M
        m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
1120
1.42M
    }
1121
464
    else
1122
464
    {
1123
464
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1124
2.10k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
1125
1.64k
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
1126
464
    }
1127
1.42M
}
1128
1129
inline void offsetCBFs(uint8_t subTUCBF[2])
1130
0
{
1131
0
    uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
1132
0
    subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
1133
0
    subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
1134
0
}
1135
1136
/* 4:2:2 post-TU split processing */
1137
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
1138
0
{
1139
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1140
1141
0
    if (log2TrSize == 2)
1142
0
    {
1143
0
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1144
0
        ++log2TrSize;
1145
0
    }
1146
1147
0
    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
1148
1149
    // move the CBFs down a level and set the parent CBF
1150
0
    uint8_t subTUCBF[2];
1151
0
    subTUCBF[0] = cu.getCbf(absPartIdx            , ttype, tuDepth);
1152
0
    subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth);
1153
0
    offsetCBFs(subTUCBF);
1154
1155
0
    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx             , tuNumParts);
1156
0
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
1157
0
}
1158
1159
/* returns distortion */
1160
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
1161
8.44M
{
1162
8.44M
    CUData& cu = mode.cu;
1163
8.44M
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1164
8.44M
    bool bEnableRDOQ = !!m_param->rdoqLevel;
1165
1166
8.44M
    if (tuDepth < cu.m_tuDepth[absPartIdx])
1167
1.34M
    {
1168
1.34M
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1169
1.34M
        uint32_t splitCbfU = 0, splitCbfV = 0;
1170
6.71M
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1171
5.37M
        {
1172
5.37M
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
1173
5.37M
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1174
5.37M
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1175
5.37M
        }
1176
1.34M
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
1177
1.34M
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
1178
1179
1.34M
        return;
1180
1.34M
    }
1181
1182
7.09M
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1183
7.09M
    uint32_t tuDepthC = tuDepth;
1184
7.09M
    if (log2TrSizeC < 2)
1185
5.36M
    {
1186
5.36M
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1187
5.36M
        if (absPartIdx & 3)
1188
4.02M
            return;
1189
1.34M
        log2TrSizeC = 2;
1190
1.34M
        tuDepthC--;
1191
1.34M
    }
1192
1193
3.07M
    if (bEnableRDOQ)
1194
3.07M
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
1195
1196
3.07M
    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
1197
3.07M
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
1198
3.07M
    if (checkTransformSkip)
1199
0
    {
1200
0
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
1201
0
        return;
1202
0
    }
1203
1204
3.07M
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1205
3.07M
    uint32_t qtLayer = log2TrSize - 2;
1206
3.07M
    uint32_t stride = mode.fencYuv->m_csize;
1207
3.07M
    const uint32_t sizeIdxC = log2TrSizeC - 2;
1208
1209
3.07M
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1210
3.07M
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1211
1212
3.07M
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1213
3.07M
    do
1214
3.07M
    {
1215
3.07M
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1216
1217
3.07M
        IntraNeighbors intraNeighbors;
1218
3.07M
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1219
1220
9.22M
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1221
6.15M
        {
1222
6.15M
            TextType ttype = (TextType)chromaId;
1223
1224
6.15M
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1225
6.15M
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1226
6.15M
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1227
6.15M
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1228
6.15M
            coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1229
6.15M
            pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1230
6.15M
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
1231
6.15M
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1232
6.15M
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1233
6.15M
            intptr_t picStride = reconPic->m_strideC;
1234
1235
6.15M
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1236
6.15M
            if (chromaPredMode == DM_CHROMA_IDX)
1237
1.23M
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1238
6.15M
            if (m_csp == X265_CSP_I422)
1239
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1240
1241
            // init availability pattern
1242
6.15M
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1243
1244
            // get prediction signal
1245
6.15M
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1246
6.15M
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1247
1248
6.15M
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1249
1250
6.15M
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
1251
6.15M
            if (numSig)
1252
23.5k
            {
1253
23.5k
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
1254
23.5k
                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1255
23.5k
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1256
23.5k
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1257
23.5k
                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
1258
23.5k
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
1259
23.5k
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1260
23.5k
            }
1261
6.12M
            else
1262
6.12M
            {
1263
                // no coded residual, recon = pred
1264
6.12M
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
1265
6.12M
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1266
6.12M
            }
1267
1268
6.15M
            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
1269
1270
6.15M
            if (m_rdCost.m_psyRd)
1271
6.15M
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1272
1.07k
            else if(m_rdCost.m_ssimRd)
1273
0
                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1274
1275
6.15M
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
1276
6.15M
        }
1277
3.07M
    }
1278
3.07M
    while (tuIterator.isNextSection());
1279
1280
3.07M
    if (splitType == VERTICAL_SPLIT)
1281
0
    {
1282
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1283
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1284
0
    }
1285
3.07M
}
1286
1287
/* returns distortion */
1288
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
1289
0
{
1290
0
    CUData& cu = mode.cu;
1291
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
1292
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1293
0
    const uint32_t log2TrSizeC = 2;
1294
0
    uint32_t qtLayer = log2TrSize - 2;
1295
1296
    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
1297
     * so the entropy coder is not very accurate. The best we can do is return it in the same
1298
     * condition as it arrived, and to do all bit estimates from the same state. */
1299
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
1300
1301
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1302
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1303
1304
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1305
0
    do
1306
0
    {
1307
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1308
1309
0
        IntraNeighbors intraNeighbors;
1310
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1311
1312
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1313
0
        {
1314
0
            TextType ttype = (TextType)chromaId;
1315
1316
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1317
0
            pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1318
0
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
1319
0
            uint32_t stride = mode.fencYuv->m_csize;
1320
0
            const uint32_t sizeIdxC = log2TrSizeC - 2;
1321
1322
0
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1323
0
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1324
0
            pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1325
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
1326
1327
            // init availability pattern
1328
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1329
1330
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1331
0
            if (chromaPredMode == DM_CHROMA_IDX)
1332
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1333
0
            if (m_csp == X265_CSP_I422)
1334
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1335
1336
            // get prediction signal
1337
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1338
1339
0
            uint64_t bCost = MAX_INT64;
1340
0
            sse_t bDist = 0;
1341
0
            uint32_t bCbf = 0;
1342
0
            uint32_t bEnergy = 0;
1343
0
            int      bTSkip = 0;
1344
1345
0
            int checkTransformSkip = 1;
1346
0
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
1347
0
            {
1348
0
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
1349
0
                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
1350
0
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
1351
1352
0
                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1353
1354
0
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
1355
0
                if (numSig)
1356
0
                {
1357
0
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
1358
0
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
1359
0
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1360
0
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1361
0
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
1362
0
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
1363
0
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1364
0
                }
1365
0
                else if (useTSkip)
1366
0
                {
1367
0
                    checkTransformSkip = 0;
1368
0
                    break;
1369
0
                }
1370
0
                else
1371
0
                {
1372
0
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
1373
0
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1374
0
                }
1375
0
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
1376
0
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
1377
1378
0
                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1379
1380
0
                uint32_t tmpBits = 0, tmpEnergy = 0;
1381
0
                if (numSig)
1382
0
                {
1383
0
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1384
0
                    m_entropyCoder.resetBits();
1385
0
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1386
0
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
1387
0
                }
1388
1389
0
                uint64_t tmpCost;
1390
0
                if (m_rdCost.m_psyRd)
1391
0
                {
1392
0
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1393
0
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
1394
0
                }
1395
0
                else if(m_rdCost.m_ssimRd)
1396
0
                {
1397
0
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1398
0
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
1399
0
                }
1400
0
                else
1401
0
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
1402
1403
0
                if (tmpCost < bCost)
1404
0
                {
1405
0
                    bCost = tmpCost;
1406
0
                    bDist = tmpDist;
1407
0
                    bTSkip = useTSkip;
1408
0
                    bCbf = !!numSig;
1409
0
                    bEnergy = tmpEnergy;
1410
0
                }
1411
0
            }
1412
1413
0
            if (bTSkip)
1414
0
            {
1415
0
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
1416
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
1417
0
            }
1418
1419
0
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1420
0
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1421
1422
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1423
0
            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1424
0
            intptr_t picStride = reconPic->m_strideC;
1425
0
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
1426
1427
0
            outCost.distortion += bDist;
1428
0
            outCost.energy += bEnergy;
1429
0
        }
1430
0
    }
1431
0
    while (tuIterator.isNextSection());
1432
1433
0
    if (splitType == VERTICAL_SPLIT)
1434
0
    {
1435
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1436
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1437
0
    }
1438
1439
0
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1440
0
}
1441
1442
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
1443
949k
{
1444
949k
    uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
1445
949k
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1446
949k
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1447
1448
949k
    if (tuDepthL == tuDepth || log2TrSizeC == 2)
1449
949k
    {
1450
        // copy transform coefficients
1451
949k
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
1452
949k
        uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1453
1454
949k
        uint32_t qtLayer   = log2TrSize - 2 - (tuDepthL - tuDepth);
1455
949k
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
1456
949k
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
1457
949k
        coeff_t* coeffDstU = cu.m_trCoeff[1]           + coeffOffsetC;
1458
949k
        coeff_t* coeffDstV = cu.m_trCoeff[2]           + coeffOffsetC;
1459
949k
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
1460
949k
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
1461
1462
        // copy reconstruction
1463
949k
        m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
1464
949k
    }
1465
249
    else
1466
249
    {
1467
249
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1468
1.31k
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
1469
1.06k
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
1470
249
    }
1471
949k
}
1472
1473
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
1474
0
{
1475
0
    CUData& cu = mode.cu;
1476
0
    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;
1477
1478
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
1479
0
    {
1480
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
1481
0
        uint32_t splitCbfU = 0, splitCbfV = 0;
1482
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1483
0
        {
1484
0
            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
1485
0
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1486
0
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1487
0
        }
1488
0
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
1489
0
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
1490
1491
0
        return;
1492
0
    }
1493
1494
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1495
0
    uint32_t tuDepthC = tuDepth;
1496
0
    if (log2TrSizeC < 2)
1497
0
    {
1498
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1499
0
        if (absPartIdx & 3)
1500
0
            return;
1501
0
        log2TrSizeC = 2;
1502
0
        tuDepthC--;
1503
0
    }
1504
1505
0
    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1506
0
    uint32_t stride = mode.fencYuv->m_csize;
1507
0
    const uint32_t sizeIdxC = log2TrSizeC - 2;
1508
1509
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1510
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1511
1512
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1513
0
    do
1514
0
    {
1515
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1516
1517
0
        IntraNeighbors intraNeighbors;
1518
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1519
1520
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1521
0
        {
1522
0
            TextType ttype = (TextType)chromaId;
1523
1524
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1525
0
            pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1526
0
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1527
0
            uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1528
0
            coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
1529
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1530
0
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1531
0
            intptr_t picStride = reconPic->m_strideC;
1532
1533
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1534
0
            if (chromaPredMode == DM_CHROMA_IDX)
1535
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1536
0
            if (m_csp == X265_CSP_I422)
1537
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1538
1539
            // init availability pattern
1540
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1541
1542
            // get prediction signal
1543
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1544
1545
0
            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
1546
1547
0
            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1548
1549
0
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
1550
0
            if (numSig)
1551
0
            {
1552
0
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
1553
0
                bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
1554
0
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1555
0
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
1556
0
                bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
1557
0
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
1558
0
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1559
0
            }
1560
0
            else
1561
0
            {
1562
                // no coded residual, recon = pred
1563
0
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
1564
0
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1565
0
            }
1566
0
        }
1567
0
    }
1568
0
    while (tuIterator.isNextSection());
1569
1570
0
    if (splitType == VERTICAL_SPLIT)
1571
0
    {
1572
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1573
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1574
0
    }
1575
0
}
1576
1577
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
1578
614k
{
1579
614k
    CUData& cu = intraMode.cu;
1580
1581
614k
    cu.setPartSizeSubParts(partSize);
1582
614k
    cu.setPredModeSubParts(MODE_INTRA);
1583
1584
614k
    uint32_t tuDepthRange[2];
1585
614k
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1586
1587
614k
    intraMode.initCosts();
1588
614k
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
1589
614k
    if (m_csp != X265_CSP_I400)
1590
614k
    {
1591
614k
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
1592
614k
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
1593
614k
    }
1594
18.4E
    else
1595
18.4E
        intraMode.distortion += intraMode.lumaDistortion;
1596
614k
    cu.m_distortion[0] = intraMode.distortion;
1597
614k
    m_entropyCoder.resetBits();
1598
614k
    if (m_slice->m_pps->bTransquantBypassEnabled)
1599
162k
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1600
1601
614k
    int skipFlagBits = 0;
1602
614k
    if (!m_slice->isIntra())
1603
0
    {
1604
0
        m_entropyCoder.codeSkipFlag(cu, 0);
1605
0
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
1606
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
1607
0
    }
1608
1609
614k
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
1610
614k
    m_entropyCoder.codePredInfo(cu, 0);
1611
614k
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
1612
1613
614k
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
1614
614k
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
1615
614k
    m_entropyCoder.store(intraMode.contexts);
1616
614k
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1617
614k
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
1618
614k
    const Yuv* fencYuv = intraMode.fencYuv;
1619
614k
    if (m_rdCost.m_psyRd)
1620
614k
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
1621
18.4E
    else if(m_rdCost.m_ssimRd)
1622
0
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
1623
1624
614k
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
1625
1626
614k
    updateModeCost(intraMode);
1627
614k
    checkDQP(intraMode, cuGeom);
1628
1629
#if ENABLE_SCC_EXT
1630
    if (m_param->bEnableSCC)
1631
        intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
1632
#endif
1633
614k
}
1634
1635
/* Note that this function does not save the best intra prediction, it must
1636
 * be generated later. It records the best mode in the cu */
1637
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1638
0
{
1639
0
    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1640
1641
0
    CUData& cu = intraMode.cu;
1642
0
    uint32_t depth = cuGeom.depth;
1643
1644
0
    cu.setPartSizeSubParts(SIZE_2Nx2N);
1645
0
    cu.setPredModeSubParts(MODE_INTRA);
1646
1647
0
    const uint32_t initTuDepth = 0;
1648
0
    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
1649
0
    uint32_t tuSize = 1 << log2TrSize;
1650
0
    const uint32_t absPartIdx = 0;
1651
1652
    // Reference sample smoothing
1653
0
    IntraNeighbors intraNeighbors;
1654
0
    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1655
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1656
1657
0
    const pixel* fenc = intraMode.fencYuv->m_buf[0];
1658
0
    uint32_t stride = intraMode.fencYuv->m_size;
1659
1660
0
    int sad, bsad;
1661
0
    uint32_t bits, bbits, mode, bmode;
1662
0
    uint64_t cost, bcost;
1663
1664
    // 33 Angle modes once
1665
0
    int scaleTuSize = tuSize;
1666
0
    int scaleStride = stride;
1667
0
    int costShift = 0;
1668
0
    int sizeIdx = log2TrSize - 2;
1669
1670
0
    if (tuSize > 32)
1671
0
    {
1672
        // CU is 64x64, we scale to 32x32 and adjust required parameters
1673
0
        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
1674
0
        fenc = m_fencScaled;
1675
1676
0
        pixel nScale[129];
1677
0
        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
1678
0
        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
1679
1680
        // we do not estimate filtering for downscaled samples
1681
0
        memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
1682
0
        memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
1683
1684
0
        scaleTuSize = 32;
1685
0
        scaleStride = 32;
1686
0
        costShift = 2;
1687
0
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1688
0
    }
1689
1690
0
    pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1691
0
    int predsize = scaleTuSize * scaleTuSize;
1692
1693
0
    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1694
1695
    /* there are three cost tiers for intra modes:
1696
     *  pred[0]          - mode probable, least cost
1697
     *  pred[1], pred[2] - less probable, slightly more cost
1698
     *  non-mpm modes    - all cost the same (rbits) */
1699
0
    uint64_t mpms;
1700
0
    uint32_t mpmModes[3];
1701
0
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1702
1703
    // DC
1704
0
    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1705
0
    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1706
0
    bmode = mode = DC_IDX;
1707
0
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1708
0
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1709
1710
    // PLANAR
1711
0
    pixel* planar = intraNeighbourBuf[0];
1712
0
    if (tuSize & (8 | 16 | 32))
1713
0
        planar = intraNeighbourBuf[1];
1714
1715
0
    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
1716
0
    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1717
0
    mode = PLANAR_IDX;
1718
0
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1719
0
    cost = m_rdCost.calcRdSADCost(sad, bits);
1720
0
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1721
1722
0
    bool allangs = true;
1723
0
    if (primitives.cu[sizeIdx].intra_pred_allangs)
1724
0
    {
1725
0
        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1726
0
        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
1727
0
    }
1728
0
    else
1729
0
        allangs = false;
1730
1731
0
#define TRY_ANGLE(angle) \
1732
0
    if (allangs) { \
1733
0
        if (angle < 18) \
1734
0
            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1735
0
        else \
1736
0
            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1737
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1738
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1739
0
    } else { \
1740
0
        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
1741
0
        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
1742
0
        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
1743
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1744
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1745
0
    }
1746
1747
0
    if (m_param->bEnableFastIntra)
1748
0
    {
1749
0
        int asad = 0;
1750
0
        uint32_t lowmode, highmode, amode = 5, abits = 0;
1751
0
        uint64_t acost = MAX_INT64;
1752
1753
        /* pick the best angle, sampling at distance of 5 */
1754
0
        for (mode = 5; mode < 35; mode += 5)
1755
0
        {
1756
0
            TRY_ANGLE(mode);
1757
0
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1758
0
        }
1759
1760
        /* refine best angle at distance 2, then distance 1 */
1761
0
        for (uint32_t dist = 2; dist >= 1; dist--)
1762
0
        {
1763
0
            lowmode = amode - dist;
1764
0
            highmode = amode + dist;
1765
1766
0
            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1767
0
            TRY_ANGLE(lowmode);
1768
0
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1769
1770
0
            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1771
0
            TRY_ANGLE(highmode);
1772
0
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1773
0
        }
1774
1775
0
        if (amode == 33)
1776
0
        {
1777
0
            TRY_ANGLE(34);
1778
0
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1779
0
        }
1780
1781
0
        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1782
0
    }
1783
0
    else // calculate and search all intra prediction angles for lowest cost
1784
0
    {
1785
0
        for (mode = 2; mode < 35; mode++)
1786
0
        {
1787
0
            TRY_ANGLE(mode);
1788
0
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1789
0
        }
1790
0
    }
1791
1792
0
    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
1793
0
    intraMode.initCosts();
1794
0
    intraMode.totalBits = bbits;
1795
0
    intraMode.distortion = bsad;
1796
0
    intraMode.sa8dCost = bcost;
1797
0
    intraMode.sa8dBits = bbits;
1798
0
}
1799
1800
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1801
0
{
1802
0
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1803
1804
0
    CUData& cu = intraMode.cu;
1805
0
    Yuv* reconYuv = &intraMode.reconYuv;
1806
1807
0
    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
1808
0
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
1809
1810
0
    uint32_t tuDepthRange[2];
1811
0
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1812
1813
0
    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
1814
1815
0
    Cost icosts;
1816
0
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
1817
0
    extractIntraResultQT(cu, *reconYuv, 0, 0);
1818
1819
0
    intraMode.lumaDistortion = icosts.distortion;
1820
0
    if (m_csp != X265_CSP_I400)
1821
0
    {
1822
0
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
1823
0
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
1824
0
    }
1825
0
    else
1826
0
        intraMode.distortion = intraMode.lumaDistortion;
1827
1828
0
    m_entropyCoder.resetBits();
1829
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
1830
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1831
0
    m_entropyCoder.codeSkipFlag(cu, 0);
1832
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
1833
0
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
1834
0
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
1835
0
    m_entropyCoder.codePredInfo(cu, 0);
1836
0
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
1837
1838
0
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
1839
0
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
1840
1841
0
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1842
0
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
1843
0
    const Yuv* fencYuv = intraMode.fencYuv;
1844
0
    if (m_rdCost.m_psyRd)
1845
0
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
1846
0
    else if(m_rdCost.m_ssimRd)
1847
0
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
1848
1849
0
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
1850
0
    m_entropyCoder.store(intraMode.contexts);
1851
0
    updateModeCost(intraMode);
1852
0
    checkDQP(intraMode, cuGeom);
1853
0
}
1854
1855
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
1856
614k
{
1857
614k
    CUData& cu = intraMode.cu;
1858
614k
    Yuv* reconYuv = &intraMode.reconYuv;
1859
614k
    Yuv* predYuv = &intraMode.predYuv;
1860
614k
    const Yuv* fencYuv = intraMode.fencYuv;
1861
1862
614k
    uint32_t depth        = cuGeom.depth;
1863
614k
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
1864
614k
    uint32_t numPU        = 1 << (2 * initTuDepth);
1865
614k
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
1866
614k
    uint32_t tuSize       = 1 << log2TrSize;
1867
614k
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;
1868
614k
    uint32_t sizeIdx      = log2TrSize - 2;
1869
614k
    uint32_t absPartIdx   = 0;
1870
614k
    sse_t totalDistortion = 0;
1871
1872
614k
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
1873
1874
    // loop over partitions
1875
2.03M
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
1876
1.41M
    {
1877
1.41M
        uint32_t bmode = 0;
1878
1879
1.41M
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
1880
0
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
1881
1.41M
        else
1882
1.41M
        {
1883
1.41M
            uint64_t candCostList[MAX_RD_INTRA_MODES];
1884
1.41M
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
1885
1.41M
            uint64_t bcost;
1886
1.41M
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
1887
1888
1.41M
            {
1889
1.41M
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1890
1891
                // Reference sample smoothing
1892
1.41M
                IntraNeighbors intraNeighbors;
1893
1.41M
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1894
1.41M
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1895
1896
                // determine set of modes to be tested (using prediction signal only)
1897
1.41M
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
1898
1.41M
                uint32_t stride = predYuv->m_size;
1899
1900
1.41M
                int scaleTuSize = tuSize;
1901
1.41M
                int scaleStride = stride;
1902
1.41M
                int costShift = 0;
1903
1904
1.41M
                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1905
1906
                /* there are three cost tiers for intra modes:
1907
                *  pred[0]          - mode probable, least cost
1908
                *  pred[1], pred[2] - less probable, slightly more cost
1909
                *  non-mpm modes    - all cost the same (rbits) */
1910
1.41M
                uint64_t mpms;
1911
1.41M
                uint32_t mpmModes[3];
1912
1.41M
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1913
1914
1.41M
                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1915
1.41M
                uint64_t modeCosts[35];
1916
1917
                // DC
1918
1.41M
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1919
1.41M
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
1920
1.41M
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1921
1.41M
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1922
1923
                // PLANAR
1924
1.41M
                pixel* planar = intraNeighbourBuf[0];
1925
1.41M
                if (tuSize >= 8 && tuSize <= 32)
1926
346k
                    planar = intraNeighbourBuf[1];
1927
1928
1.41M
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
1929
1.41M
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
1930
1.41M
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1931
1.41M
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1932
1.41M
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1933
1934
                // angular predictions
1935
1.41M
                if (primitives.cu[sizeIdx].intra_pred_allangs)
1936
0
                {
1937
0
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1938
0
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
1939
0
                    for (int mode = 2; mode < 35; mode++)
1940
0
                    {
1941
0
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1942
0
                        if (mode < 18)
1943
0
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1944
0
                        else
1945
0
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1946
0
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1947
0
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1948
0
                    }
1949
0
                }
1950
1.41M
                else
1951
1.41M
                {
1952
47.9M
                    for (int mode = 2; mode < 35; mode++)
1953
46.5M
                    {
1954
46.5M
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1955
46.5M
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
1956
46.5M
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
1957
46.5M
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
1958
46.5M
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1959
46.5M
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1960
46.5M
                    }
1961
1.41M
                }
1962
1963
                /* Find the top maxCandCount candidate modes with cost within 25% of best
1964
                * or among the most probable modes. maxCandCount is derived from the
1965
                * rdLevel and depth. In general we want to try more modes at slower RD
1966
                * levels and at higher depths */
1967
12.2M
                for (int i = 0; i < maxCandCount; i++)
1968
10.8M
                    candCostList[i] = MAX_INT64;
1969
1970
1.41M
                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
1971
51.0M
                for (int mode = 0; mode < 35; mode++)
1972
49.6M
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
1973
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
1974
1.50M
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1975
1.41M
            }
1976
1977
            /* measure best candidates using simple RDO (no TU splits) */
1978
1.41M
            bcost = MAX_INT64;
1979
2.85M
            for (int i = 0; i < maxCandCount; i++)
1980
2.85M
            {
1981
2.85M
                if (candCostList[i] == MAX_INT64)
1982
1.41M
                    break;
1983
1984
1.43M
                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1985
1986
1.43M
                m_entropyCoder.load(m_rqt[depth].cur);
1987
1.43M
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
1988
1989
1.43M
                Cost icosts;
1990
1.43M
                if (checkTransformSkip)
1991
0
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1992
1.43M
                else
1993
1.43M
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
1994
1.43M
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1995
1.43M
            }
1996
1.41M
        }
1997
1998
1.41M
        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1999
2000
        /* remeasure best mode, allowing TU splits */
2001
1.41M
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
2002
1.41M
        m_entropyCoder.load(m_rqt[depth].cur);
2003
2004
1.41M
        Cost icosts;
2005
1.41M
        if (checkTransformSkip)
2006
0
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
2007
1.41M
        else
2008
1.41M
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
2009
1.41M
        totalDistortion += icosts.distortion;
2010
2011
1.41M
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
2012
2013
        // set reconstruction for next intra prediction blocks
2014
1.41M
        if (puIdx != numPU - 1)
2015
804k
        {
2016
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
2017
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
2018
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
2019
             * that the contexts should be tracked through each PU */
2020
804k
            PicYuv*  reconPic = m_frame->m_reconPic[0];
2021
804k
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
2022
804k
            uint32_t dststride = reconPic->m_stride;
2023
804k
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
2024
804k
            uint32_t srcstride = reconYuv->m_size;
2025
804k
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
2026
804k
        }
2027
1.41M
    }
2028
2029
614k
    if (numPU > 1)
2030
268k
    {
2031
268k
        uint32_t combCbfY = 0;
2032
1.34M
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2033
1.07M
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
2034
2035
268k
        cu.m_cbf[0][0] |= combCbfY;
2036
268k
    }
2037
2038
    // TODO: remove this
2039
614k
    m_entropyCoder.load(m_rqt[depth].cur);
2040
2041
614k
    return totalDistortion;
2042
614k
}
2043
2044
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
2045
0
{
2046
0
    CUData& cu = intraMode.cu;
2047
0
    const Yuv* fencYuv = intraMode.fencYuv;
2048
0
    Yuv* predYuv = &intraMode.predYuv;
2049
2050
0
    uint32_t bestMode  = 0;
2051
0
    uint64_t bestCost  = MAX_INT64;
2052
0
    uint32_t modeList[NUM_CHROMA_MODE];
2053
2054
0
    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
2055
0
    uint32_t tuSize = 1 << log2TrSizeC;
2056
0
    uint32_t tuDepth = 0;
2057
0
    int32_t costShift = 0;
2058
2059
0
    if (tuSize > 32)
2060
0
    {
2061
0
        tuDepth = 1;
2062
0
        costShift = 2;
2063
0
        log2TrSizeC = 5;
2064
0
    }
2065
2066
0
    IntraNeighbors intraNeighbors;
2067
0
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
2068
0
    cu.getAllowedChromaDir(0, modeList);
2069
2070
    // check chroma modes
2071
0
    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
2072
0
    {
2073
0
        uint32_t chromaPredMode = modeList[mode];
2074
0
        if (chromaPredMode == DM_CHROMA_IDX)
2075
0
            chromaPredMode = cu.m_lumaIntraDir[0];
2076
0
        if (m_csp == X265_CSP_I422)
2077
0
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
2078
2079
0
        uint64_t cost = 0;
2080
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2081
0
        {
2082
0
            const pixel* fenc = fencYuv->m_buf[chromaId];
2083
0
            pixel* pred = predYuv->m_buf[chromaId];
2084
0
            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
2085
            // get prediction signal
2086
0
            predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC);
2087
0
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
2088
0
        }
2089
2090
0
        if (cost < bestCost)
2091
0
        {
2092
0
            bestCost = cost;
2093
0
            bestMode = modeList[mode];
2094
0
        }
2095
0
    }
2096
2097
0
    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
2098
0
}
2099
2100
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
2101
614k
{
2102
614k
    CUData& cu = intraMode.cu;
2103
614k
    Yuv& reconYuv = intraMode.reconYuv;
2104
2105
614k
    uint32_t depth       = cuGeom.depth;
2106
614k
    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
2107
614k
    uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
2108
614k
    uint32_t absPartStep = cuGeom.numPartitions;
2109
614k
    sse_t totalDistortion = 0;
2110
2111
614k
    int size = partitionFromLog2Size(log2TrSize);
2112
2113
614k
    TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
2114
2115
614k
    do
2116
614k
    {
2117
614k
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2118
2119
614k
        uint32_t bestMode = 0;
2120
614k
        sse_t bestDist = 0;
2121
614k
        uint64_t bestCost = MAX_INT64;
2122
2123
        // init mode list
2124
614k
        uint32_t minMode = 0;
2125
614k
        uint32_t maxMode = NUM_CHROMA_MODE;
2126
614k
        uint32_t modeList[NUM_CHROMA_MODE];
2127
2128
614k
        if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
2129
0
        {
2130
0
            for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
2131
0
                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
2132
0
            maxMode = 1;
2133
0
        }
2134
614k
        else
2135
614k
            cu.getAllowedChromaDir(absPartIdxC, modeList);
2136
2137
614k
        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
2138
0
        {
2139
0
            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
2140
0
                modeList[l] = modeList[0];
2141
0
            maxMode = 1;
2142
0
        }
2143
        // check chroma modes
2144
3.68M
        for (uint32_t mode = minMode; mode < maxMode; mode++)
2145
3.07M
        {
2146
            // restore context models
2147
3.07M
            m_entropyCoder.load(m_rqt[depth].cur);
2148
2149
3.07M
            cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
2150
3.07M
            Cost outCost;
2151
3.07M
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);
2152
2153
3.07M
            if (m_slice->m_pps->bTransformSkipEnabled)
2154
0
                m_entropyCoder.load(m_rqt[depth].cur);
2155
2156
3.07M
            m_entropyCoder.resetBits();
2157
            // chroma prediction mode
2158
3.07M
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
2159
3.07M
            {
2160
3.07M
                if (!absPartIdxC)
2161
3.07M
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
2162
3.07M
            }
2163
18.4E
            else
2164
18.4E
            {
2165
18.4E
                uint32_t qNumParts = cuGeom.numPartitions >> 2;
2166
18.4E
                if (!(absPartIdxC & (qNumParts - 1)))
2167
0
                    m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
2168
18.4E
            }
2169
2170
3.07M
            codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
2171
3.07M
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
2172
3.07M
            codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
2173
3.07M
            uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
2174
3.07M
            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy)
2175
1.59k
                                             : m_rdCost.calcRdCost(outCost.distortion, bits);
2176
2177
3.07M
            if (cost < bestCost)
2178
948k
            {
2179
948k
                bestCost = cost;
2180
948k
                bestDist = outCost.distortion;
2181
948k
                bestMode = modeList[mode];
2182
948k
                extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
2183
948k
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2184
948k
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2185
948k
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2186
948k
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
2187
948k
            }
2188
3.07M
        }
2189
2190
614k
        if (!tuIterator.isLastSection())
2191
0
        {
2192
0
            uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
2193
0
            PicYuv*  reconPic  = m_frame->m_reconPic[0];
2194
0
            uint32_t dststride = reconPic->m_strideC;
2195
0
            const pixel* src;
2196
0
            pixel* dst;
2197
2198
0
            dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
2199
0
            src = reconYuv.getCbAddr(absPartIdxC);
2200
0
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
2201
2202
0
            dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
2203
0
            src = reconYuv.getCrAddr(absPartIdxC);
2204
0
            primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
2205
0
        }
2206
2207
614k
        memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
2208
614k
        memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
2209
614k
        memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
2210
614k
        memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
2211
614k
        cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
2212
614k
        totalDistortion += bestDist;
2213
614k
    }
2214
614k
    while (tuIterator.isNextSection());
2215
2216
614k
    if (initTuDepth != 0)
2217
0
    {
2218
0
        uint32_t combCbfU = 0;
2219
0
        uint32_t combCbfV = 0;
2220
0
        uint32_t qNumParts = tuIterator.absPartIdxStep;
2221
0
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2222
0
        {
2223
0
            combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
2224
0
            combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
2225
0
        }
2226
2227
0
        cu.m_cbf[1][0] |= combCbfU;
2228
0
        cu.m_cbf[2][0] |= combCbfV;
2229
0
    }
2230
2231
    /* TODO: remove this */
2232
614k
    m_entropyCoder.load(m_rqt[depth].cur);
2233
614k
    return totalDistortion;
2234
614k
}
2235
2236
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
2237
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
2238
0
{
2239
0
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
2240
2241
0
    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
2242
0
    uint8_t  candDir[MRG_MAX_NUM_CANDS];
2243
0
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
2244
#if ENABLE_SCC_EXT
2245
    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
2246
#else
2247
0
    if (cu.isBipredRestriction())
2248
0
    {
2249
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2250
0
        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2251
0
        {
2252
0
            if (candDir[mergeCand] == 3)
2253
0
            {
2254
0
                candDir[mergeCand] = 1;
2255
0
                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
2256
0
            }
2257
0
        }
2258
0
    }
2259
0
#endif
2260
2261
0
    Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2262
2263
0
    uint32_t outCost = MAX_UINT;
2264
0
    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2265
0
    {
2266
        /* Prevent TMVP candidates from using unavailable reference pixels */
2267
0
        if (m_bFrameParallel)
2268
0
        {
2269
            // Parallel slices bound check
2270
0
            if (m_param->maxSlices > 1)
2271
0
            {
2272
0
                if (cu.m_bFirstRowInSlice &
2273
0
                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
2274
0
                    continue;
2275
2276
                // Last row in slice can't reference beyond bound since it is another slice area
2277
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2278
0
                if (cu.m_bLastRowInSlice &&
2279
0
                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
2280
0
                    continue;
2281
0
            }
2282
2283
0
            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2284
0
                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
2285
0
                continue;
2286
0
        }
2287
2288
#if ENABLE_SCC_EXT
2289
        if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc))
2290
        {
2291
            continue;
2292
        }
2293
#endif
2294
0
        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
2295
0
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
2296
0
        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
2297
0
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
2298
2299
0
        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
2300
2301
0
        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
2302
0
        if (m_me.bChromaSATD)
2303
0
            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
2304
2305
0
        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
2306
0
        costCand = costCand + m_rdCost.getCost(bitsCand);
2307
0
        if (costCand < outCost)
2308
0
        {
2309
0
            outCost = costCand;
2310
0
            m.bits = bitsCand;
2311
0
            m.index = mergeCand;
2312
0
        }
2313
0
    }
2314
2315
0
    m.mvField[0] = candMvField[m.index][0];
2316
0
    m.mvField[1] = candMvField[m.index][1];
2317
0
    m.dir = candDir[m.index];
2318
2319
0
    return outCost;
2320
0
}
2321
2322
/* find the lowres motion vector from lookahead in middle of current PU */
2323
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
2324
0
{
2325
0
    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
2326
0
    if (diffPoc > m_param->bframes + 1)
2327
        /* poc difference is out of range for lookahead */
2328
0
        return 0;
2329
2330
0
    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc];
2331
0
    if (mvs[0].x == 0x7FFF)
2332
        /* this motion search was not estimated by lookahead */
2333
0
        return 0;
2334
2335
0
    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
2336
0
    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
2337
0
    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
2338
2339
0
    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
2340
0
    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
2341
2342
0
    return mvs[idx] << 1; /* scale up lowres mv */
2343
0
}
2344
2345
/* Pick between the two AMVP candidates which is the best one to use as
2346
 * MVP for the motion search, based on SAD cost */
2347
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
2348
0
{
2349
0
    if (amvp[0] == amvp[1])
2350
0
        return 0;
2351
2352
0
    Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
2353
0
    uint32_t costs[AMVP_NUM_CANDS];
2354
2355
0
    for (int i = 0; i < AMVP_NUM_CANDS; i++)
2356
0
    {
2357
0
        MV mvCand = amvp[i];
2358
2359
        // NOTE: skip mvCand if Y is > merange and -FN>1
2360
0
        if (m_bFrameParallel)
2361
0
        {
2362
0
            costs[i] = m_me.COST_MAX;
2363
2364
0
            if (mvCand.y >= (m_param->searchRange + 1) * 4)
2365
0
                continue;
2366
2367
0
            if ((m_param->maxSlices > 1) &
2368
0
                ((mvCand.y < m_sliceMinY)
2369
0
              |  (mvCand.y > m_sliceMaxY)))
2370
0
                continue;
2371
0
        }
2372
0
        cu.clipMv(mvCand);
2373
#if ENABLE_SCC_EXT
2374
        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1)
2375
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand);
2376
        else
2377
#endif
2378
0
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
2379
0
        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
2380
0
    }
2381
2382
0
    return (costs[0] <= costs[1]) ? 0 : 1;
2383
0
}
2384
2385
void Search::PME::processTasks(int workerThreadId)
2386
0
{
2387
#if DETAILED_CU_STATS
2388
    int fe = mode.cu.m_encData->m_frameEncoderID;
2389
    master.m_stats[fe].countPMETasks++;
2390
    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
2391
#endif
2392
0
    ProfileScopeEvent(pme);
2393
0
    master.processPME(*this, master.m_tld[workerThreadId].analysis);
2394
0
}
2395
2396
void Search::processPME(PME& pme, Search& slave)
2397
0
{
2398
    /* acquire a motion estimation job, else exit early */
2399
0
    int meId;
2400
0
    pme.m_lock.acquire();
2401
0
    if (pme.m_jobTotal > pme.m_jobAcquired)
2402
0
    {
2403
0
        meId = pme.m_jobAcquired++;
2404
0
        pme.m_lock.release();
2405
0
    }
2406
0
    else
2407
0
    {
2408
0
        pme.m_lock.release();
2409
0
        return;
2410
0
    }
2411
2412
    /* Setup slave Search instance for ME for master's CU */
2413
0
    if (&slave != this)
2414
0
    {
2415
0
        slave.m_slice = m_slice;
2416
0
        slave.m_frame = m_frame;
2417
0
        slave.m_param = m_param;
2418
0
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
2419
0
        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
2420
0
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
2421
0
    }
2422
2423
    /* Perform ME, repeat until no more work is available */
2424
0
    do
2425
0
    {
2426
0
        if (meId < pme.m_jobs.refCnt[0])
2427
0
        {
2428
0
            int refIdx = pme.m_jobs.ref[0][meId]; //L0
2429
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
2430
0
        }
2431
0
        else
2432
0
        {
2433
0
            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
2434
0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
2435
0
        }
2436
2437
0
        meId = -1;
2438
0
        pme.m_lock.acquire();
2439
0
        if (pme.m_jobTotal > pme.m_jobAcquired)
2440
0
            meId = pme.m_jobAcquired++;
2441
0
        pme.m_lock.release();
2442
0
    }
2443
0
    while (meId >= 0);
2444
0
}
2445
2446
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
2447
0
{
2448
0
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
2449
0
    int numIdx = m_slice->m_numRefIdx[list];
2450
#if ENABLE_SCC_EXT
2451
    if (!list && m_ibcEnabled)
2452
        numIdx--;
2453
#endif
2454
0
    bits += getTUBits(ref, numIdx);
2455
2456
0
    MotionData* bestME = interMode.bestME[part];
2457
2458
    // 12 mv candidates including lowresMV
2459
0
    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2460
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2461
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx);
2462
#else
2463
0
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2464
0
#endif
2465
2466
0
    const MV* amvp = interMode.amvpCand[list][ref];
2467
0
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
2468
0
    bool bLowresMVP = false;
2469
0
    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2470
2471
0
    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging if lowresMV is not available */
2472
0
    {
2473
0
        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
2474
0
        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2475
0
        if (lmv.notZero() && !layer)
2476
0
            mvc[numMvc++] = lmv;
2477
0
        if (m_param->bEnableHME)
2478
0
            mvp_lowres = lmv;
2479
0
    }
2480
2481
0
    m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc;
2482
0
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
2483
2484
0
    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2485
0
      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2486
2487
0
    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2488
0
    {
2489
0
        MV outmv_lowres;
2490
0
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2491
0
        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2492
0
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2493
0
        if (lowresMvCost < satdCost)
2494
0
        {
2495
0
            outmv = outmv_lowres;
2496
0
            satdCost = lowresMvCost;
2497
0
            bLowresMVP = true;
2498
0
        }
2499
0
    }
2500
    /* Get total cost of partition, but only include MV bit cost once */
2501
0
    bits += m_me.bitcost(outmv);
2502
0
    uint32_t mvCost = m_me.mvcost(outmv);
2503
0
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2504
2505
    /* Update LowresMVP to best AMVP cand*/
2506
0
    if (bLowresMVP)
2507
0
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2508
2509
    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2510
0
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2511
2512
    /* tie goes to the smallest ref ID, just like --no-pme */
2513
0
    ScopedLock _lock(master.m_meLock);
2514
0
    if (cost < bestME[list].cost ||
2515
0
       (cost == bestME[list].cost && ref < bestME[list].ref))
2516
0
    {
2517
0
        bestME[list].mv = outmv;
2518
0
        bestME[list].mvp = mvp;
2519
0
        bestME[list].mvpIdx = mvpIdx;
2520
0
        bestME[list].ref = ref;
2521
0
        bestME[list].cost = cost;
2522
0
        bestME[list].bits = bits;
2523
0
        bestME[list].mvCost  = mvCost;
2524
0
    }
2525
0
}
2526
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
2527
0
{
2528
0
    CUData& cu = interMode.cu;
2529
0
    MV mv, mvmin, mvmax;
2530
0
    int cand = 0, bestcost = INT_MAX;
2531
0
    while (cand < m_param->mvRefine)
2532
0
    {
2533
0
        if ((cand && mvp[cand] == mvp[cand - 1]) || (cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1])))
2534
0
        {
2535
0
            cand++;
2536
0
            continue;
2537
0
        }
2538
0
        MV bestMV;
2539
0
        mv = mvp[cand++];
2540
0
        cu.clipMv(mv);
2541
0
        m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2542
0
        setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
2543
0
        int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices, m_vertRestriction,
2544
0
        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2545
0
        if (bestcost > cost)
2546
0
        {
2547
0
            bestcost = cost;
2548
0
            outmv = bestMV;
2549
0
        }
2550
0
    }
2551
0
}
2552
/* find the best inter prediction for each PU of specified mode */
2553
#if ENABLE_SCC_EXT
2554
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2], MV* iMVCandList)
2555
#else
2556
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
2557
#endif
2558
0
{
2559
0
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
2560
2561
0
    CUData& cu = interMode.cu;
2562
0
    Yuv* predYuv = &interMode.predYuv;
2563
2564
    // 12 mv candidates including lowresMV
2565
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2566
2567
0
    const Slice *slice = m_slice;
2568
0
    int numPart     = cu.getNumPartInter(0);
2569
0
    int numPredDir  = slice->isInterP() ? 1 : 2;
2570
0
    const int* numRefIdx = slice->m_numRefIdx;
2571
0
    uint32_t lastMode = 0;
2572
0
    int      totalmebits = 0;
2573
0
    MV       mvzero(0, 0);
2574
0
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2575
0
    MergeData merge;
2576
0
    memset(&merge, 0, sizeof(merge));
2577
0
    bool useAsMVP = false;
2578
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
2579
0
    {
2580
0
        MotionData* bestME = interMode.bestME[puIdx];
2581
0
        PredictionUnit pu(cu, cuGeom, puIdx);
2582
0
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
2583
0
        useAsMVP = false;
2584
0
        x265_analysis_inter_data* interDataCTU = NULL;
2585
0
        int cuIdx;
2586
0
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
2587
0
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
2588
0
        {
2589
0
            interDataCTU = m_frame->m_analysisData.interData;
2590
0
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
2591
0
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
2592
0
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
2593
0
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
2594
0
                useAsMVP = true;
2595
0
        }
2596
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
2597
0
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
2598
0
        bestME[0].cost = MAX_UINT;
2599
0
        bestME[1].cost = MAX_UINT;
2600
2601
0
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
2602
0
        bool bDoUnidir = true;
2603
2604
0
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
2605
        /* Uni-directional prediction */
2606
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2607
0
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
2608
0
        {
2609
0
            for (int list = 0; list < numPredDir; list++)
2610
0
            {
2611
2612
0
                int ref = -1;
2613
0
                if (useAsMVP)
2614
0
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
2615
0
                else
2616
0
                    ref = bestME[list].ref;
2617
0
                if (ref < 0)
2618
0
                {
2619
0
                    continue;
2620
0
                }
2621
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2622
0
                int numIdx = m_slice->m_numRefIdx[list];
2623
#if ENABLE_SCC_EXT
2624
                if (!list && m_ibcEnabled)
2625
                    numIdx--;
2626
#endif
2627
0
                bits += getTUBits(ref, numIdx);
2628
2629
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2630
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2631
#else
2632
0
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2633
0
#endif
2634
0
                const MV* amvp = interMode.amvpCand[list][ref];
2635
0
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2636
0
                MV mvmin, mvmax, outmv, mvp;
2637
0
                if (useAsMVP)
2638
0
                {
2639
0
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
2640
0
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
2641
0
                }
2642
0
                else
2643
0
                    mvp = amvp[mvpIdx];
2644
0
                if (m_param->searchMethod == X265_SEA)
2645
0
                {
2646
0
                    int puX = puIdx & 1;
2647
0
                    int puY = puIdx >> 1;
2648
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2649
0
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2650
0
                }
2651
0
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2652
0
                MV mvpIn = mvp;
2653
0
                int satdCost;
2654
0
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
2655
0
                    mvpIn = bestME[list].mv;
2656
0
                if (useAsMVP && m_param->mvRefine > 1)
2657
0
                {
2658
0
                    MV bestmv, mvpSel[3];
2659
0
                    int mvpIdxSel[3];
2660
0
                    satdCost = m_me.COST_MAX;
2661
0
                    mvpSel[0] = mvp;
2662
0
                    mvpIdxSel[0] = mvpIdx;
2663
0
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2664
0
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
2665
0
                    mvpIdxSel[1] = mvpIdx;
2666
0
                    if (m_param->mvRefine > 2)
2667
0
                    {
2668
0
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
2669
0
                        mvpIdxSel[2] = !mvpIdx;
2670
0
                    }
2671
0
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
2672
0
                    {
2673
0
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
2674
0
                            continue;
2675
0
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
2676
0
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices, m_vertRestriction,
2677
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2678
0
                        if (satdCost > bcost)
2679
0
                        {
2680
0
                            satdCost = bcost;
2681
0
                            outmv = bestmv;
2682
0
                            mvp = mvpSel[cand];
2683
0
                            mvpIdx = mvpIdxSel[cand];
2684
0
                        }
2685
0
                    }
2686
0
                    mvpIn = mvp;
2687
0
                }
2688
0
                else
2689
0
                {
2690
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2691
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2692
0
                }
2693
2694
                /* Get total cost of partition, but only include MV bit cost once */
2695
0
                bits += m_me.bitcost(outmv);
2696
0
                uint32_t mvCost = m_me.mvcost(outmv);
2697
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2698
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
2699
0
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
2700
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2701
0
                else
2702
0
                {
2703
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
2704
                      the actual mvp is bestME from pass 1 for that mvpIdx */
2705
0
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
2706
0
                    if (diffBits < 0)
2707
0
                    {
2708
0
                        mvpIdx = !mvpIdx;
2709
0
                        uint32_t origOutBits = bits;
2710
0
                        bits = origOutBits + diffBits;
2711
0
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
2712
0
                    }
2713
0
                    mvp = amvp[mvpIdx];
2714
0
                }
2715
2716
0
                if (cost < bestME[list].cost)
2717
0
                {
2718
0
                    bestME[list].mv = outmv;
2719
0
                    bestME[list].mvp = mvp;
2720
0
                    bestME[list].mvpIdx = mvpIdx;
2721
0
                    bestME[list].cost = cost;
2722
0
                    bestME[list].bits = bits;
2723
0
                    bestME[list].mvCost  = mvCost;
2724
0
                    bestME[list].ref = ref;
2725
0
                }
2726
0
                bDoUnidir = false;
2727
0
            }            
2728
0
        }
2729
0
        else if (m_param->bDistributeMotionEstimation)
2730
0
        {
2731
0
            PME pme(*this, interMode, cuGeom, pu, puIdx);
2732
0
            pme.m_jobTotal = 0;
2733
0
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
2734
2735
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2736
0
            for (int list = 0; list < numPredDir; list++)
2737
0
            {
2738
0
                int idx = 0;
2739
0
                int numIdx = numRefIdx[list];
2740
#if ENABLE_SCC_EXT
2741
                if (!list && m_ibcEnabled)
2742
                    numIdx--;
2743
#endif
2744
0
                for (int ref = 0; ref < numIdx; ref++)
2745
0
                {
2746
0
                    if (!(refMask & (1 << ref)))
2747
0
                        continue;
2748
2749
0
                    pme.m_jobs.ref[list][idx++]  = ref;
2750
0
                    pme.m_jobTotal++;
2751
0
                }
2752
0
                pme.m_jobs.refCnt[list] = idx;
2753
2754
                /* the second list ref bits start at bit 16 */
2755
0
                refMask >>= 16;
2756
0
            }
2757
2758
0
            if (pme.m_jobTotal > 2)
2759
0
            {
2760
0
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
2761
2762
0
                processPME(pme, *this);
2763
2764
0
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
2765
0
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
2766
2767
0
                bDoUnidir = false;
2768
2769
0
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
2770
0
                pme.waitForExit();
2771
0
            }
2772
2773
            /* if no peer threads were bonded, fall back to doing unidirectional
2774
             * searches ourselves without overhead of singleMotionEstimation() */
2775
0
        }
2776
0
        if (bDoUnidir && !m_param->bThreadedME)
2777
0
        {
2778
0
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
2779
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2780
2781
0
            for (int list = 0; list < numPredDir; list++)
2782
0
            {
2783
0
                int numIdx = numRefIdx[list];
2784
#if ENABLE_SCC_EXT
2785
                if (!list && m_ibcEnabled)
2786
                    numIdx--;
2787
#endif
2788
0
                for (int ref = 0; ref < numIdx; ref++)
2789
0
                {
2790
0
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
2791
2792
0
                    if (!(refMask & (1 << ref)))
2793
0
                    {
2794
0
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
2795
0
                        continue;
2796
0
                    }
2797
2798
0
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2799
0
                    bits += getTUBits(ref, numIdx);
2800
2801
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2802
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2803
#else
2804
0
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2805
0
#endif
2806
2807
0
                    const MV* amvp = interMode.amvpCand[list][ref];
2808
0
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2809
0
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2810
0
                    bool bLowresMVP = false;
2811
2812
0
                    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging when lowresMV is not available */
2813
0
                    {
2814
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
2815
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2816
0
                        if (lmv.notZero() && !layer)
2817
0
                            mvc[numMvc++] = lmv;
2818
0
                        if (m_param->bEnableHME)
2819
0
                            mvp_lowres = lmv;
2820
0
                    }
2821
0
                    if (m_param->searchMethod == X265_SEA)
2822
0
                    {
2823
0
                        int puX = puIdx & 1;
2824
0
                        int puY = puIdx >> 1;
2825
0
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2826
0
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2827
0
                    }
2828
0
                    m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2829
0
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2830
0
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2831
0
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2832
2833
0
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2834
0
                    {
2835
0
                        MV outmv_lowres;
2836
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2837
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2838
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2839
0
                        if (lowresMvCost < satdCost)
2840
0
                        {
2841
0
                            outmv = outmv_lowres;
2842
0
                            satdCost = lowresMvCost;
2843
0
                            bLowresMVP = true;
2844
0
                        }
2845
0
                    }
2846
2847
                    /* Get total cost of partition, but only include MV bit cost once */
2848
0
                    bits += m_me.bitcost(outmv);
2849
0
                    uint32_t mvCost = m_me.mvcost(outmv);
2850
0
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2851
                    /* Update LowresMVP to best AMVP cand*/
2852
0
                    if (bLowresMVP)
2853
0
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2854
2855
                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2856
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2857
2858
#if ENABLE_SCC_EXT
2859
                    if (m_param->bEnableSCC && (list <= 1 && ref <= 1 && (cu.m_partSize[0] == SIZE_2NxN || cu.m_partSize[0] == SIZE_Nx2N) && (1 << cu.m_log2CUSize[0]) <= 16))
2860
                    {
2861
                        iMVCandList[4 * list + 2 * ref + puIdx] = outmv;
2862
                    }
2863
#endif
2864
2865
0
                    if (cost < bestME[list].cost)
2866
0
                    {
2867
0
                        bestME[list].mv      = outmv;
2868
0
                        bestME[list].mvp     = mvp;
2869
0
                        bestME[list].mvpIdx  = mvpIdx;
2870
0
                        bestME[list].ref     = ref;
2871
0
                        bestME[list].cost    = cost;
2872
0
                        bestME[list].bits    = bits;
2873
0
                        bestME[list].mvCost  = mvCost;
2874
0
                    }
2875
0
                }
2876
                /* the second list ref bits start at bit 16 */
2877
0
                refMask >>= 16;
2878
0
            }
2879
0
        }
2880
2881
        /* Bi-directional prediction */
2882
0
        MotionData bidir[2];
2883
0
        uint32_t bidirCost = MAX_UINT;
2884
0
        int bidirBits = 0;
2885
2886
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
2887
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
2888
0
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !m_param->bThreadedME)
2889
0
        {
2890
0
            bidir[0] = bestME[0];
2891
0
            bidir[1] = bestME[1];
2892
2893
0
            int satdCost;
2894
2895
0
            if (m_me.bChromaSATD)
2896
0
            {
2897
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
2898
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2899
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
2900
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2901
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
2902
2903
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2904
0
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2905
0
            }
2906
0
            else
2907
0
            {
2908
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
2909
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
2910
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2911
2912
                /* Generate reference subpels */
2913
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
2914
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
2915
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
2916
0
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
2917
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2918
0
            }
2919
2920
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2921
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2922
2923
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2924
0
            if (bTryZero)
2925
0
            {
2926
                /* Do not try zero MV if unidir motion predictors are beyond
2927
                 * valid search area */
2928
0
                MV mvmin, mvmax;
2929
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2930
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2931
0
                mvmax.y += 2; // there is some pad for subpel refine
2932
0
                mvmin <<= 2;
2933
0
                mvmax <<= 2;
2934
2935
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2936
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2937
0
            }
2938
0
            if (bTryZero)
2939
0
            {
2940
                /* coincident blocks of the two reference pictures */
2941
0
                if (m_me.bChromaSATD)
2942
0
                {
2943
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
2944
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2945
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
2946
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2947
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
2948
2949
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2950
0
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2951
0
                }
2952
0
                else
2953
0
                {
2954
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2955
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
2956
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
2957
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2958
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2959
0
                }
2960
0
                MV mvp0 = bestME[0].mvp;
2961
0
                int mvpIdx0 = bestME[0].mvpIdx;
2962
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2963
2964
0
                MV mvp1 = bestME[1].mvp;
2965
0
                int mvpIdx1 = bestME[1].mvpIdx;
2966
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
2967
2968
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2969
2970
                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2971
0
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
2972
0
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
2973
2974
0
                if (cost < bidirCost)
2975
0
                {
2976
0
                    bidir[0].mv = mvzero;
2977
0
                    bidir[1].mv = mvzero;
2978
0
                    bidir[0].mvp = mvp0;
2979
0
                    bidir[1].mvp = mvp1;
2980
0
                    bidir[0].mvpIdx = mvpIdx0;
2981
0
                    bidir[1].mvpIdx = mvpIdx1;
2982
0
                    bidirCost = cost;
2983
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2984
0
                }
2985
0
            }
2986
0
        }
2987
2988
0
        uint32_t bestCost = MAX_INT;
2989
0
        bool isMerge = false;
2990
0
        bool isBidir = false;
2991
0
        bool uniL0 = false;
2992
0
        bool uniL1 = false;
2993
2994
0
        if (m_param->bThreadedME)
2995
0
        {
2996
0
            int cuSize = 1 << cu.m_log2CUSize[0];
2997
2998
0
            int lookupWidth = pu.width;
2999
0
            int lookupHeight = pu.height;
3000
3001
0
            bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU;
3002
3003
0
            if (isAmp)
3004
0
            {
3005
0
                if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == SIZE_2NxnD)
3006
0
                    lookupHeight = (puIdx) ? (pu.width - pu.height) : pu.height;
3007
0
                else
3008
0
                    lookupWidth = (puIdx) ? (pu.height - pu.width) : pu.width;
3009
0
            }
3010
3011
0
            int startIdx = g_puStartIdx[lookupWidth + lookupHeight][static_cast<int>(cu.m_partSize[0])];
3012
3013
0
            int alignWidth = isAmp ? cuSize : pu.width;
3014
0
            int alignHeight = isAmp ? cuSize : pu.height;
3015
3016
0
            int numPUX = m_param->maxCUSize / alignWidth;
3017
0
            int numPUY = m_param->maxCUSize / alignHeight;
3018
3019
0
            int puOffset = isAmp ? (puIdx * numPUX * numPUY) : (cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx);
3020
 
3021
0
            int relX = (cu.m_cuPelX / alignWidth) % numPUX;
3022
0
            int relY = (cu.m_cuPelY / alignHeight) % numPUY;
3023
3024
0
            int index = startIdx + (relY * numPUX + relX) + puOffset;
3025
3026
0
            int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
3027
0
            int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
3028
3029
0
            int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row;
3030
3031
0
            MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index];
3032
3033
0
            bestME[0].ref = meData.ref[0];
3034
0
            bestME[1].ref = meData.ref[1];
3035
3036
0
            isBidir = (bestME[0].ref >= 0 && bestME[1].ref >= 0);
3037
0
            uniL0 = (bestME[0].ref >= 0 && bestME[1].ref == REF_NOT_VALID);
3038
0
            uniL1 = (bestME[1].ref >= 0 && bestME[0].ref == REF_NOT_VALID);
3039
3040
0
            if(isBidir)
3041
0
            {
3042
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3043
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3044
3045
0
                bidir[0].mv = meData.mv[0];
3046
0
                bidir[1].mv = meData.mv[1];
3047
0
                bidir[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3048
0
                bidir[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3049
0
                bidir[0].mvCost = meData.mvCost[0];
3050
0
                bidir[1].mvCost = meData.mvCost[1];
3051
0
                bidirCost = meData.cost;
3052
0
                bidirBits = meData.bits;
3053
3054
0
                bestCost = bidirCost;
3055
0
            }
3056
0
            else if (uniL0)
3057
0
            {
3058
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3059
3060
0
                bestME[0].mv = meData.mv[0];
3061
0
                bestME[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3062
0
                bestME[0].mvCost = meData.mvCost[0];
3063
0
                bestME[0].cost = meData.cost;
3064
0
                bestME[0].bits = meData.bits;
3065
3066
0
                bestCost = bestME[0].cost;
3067
0
            }
3068
0
            else if (uniL1)
3069
0
            {
3070
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3071
3072
0
                bestME[1].mv = meData.mv[1];
3073
0
                bestME[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3074
0
                bestME[1].mvCost = meData.mvCost[1];
3075
0
                bestME[1].cost = meData.cost;
3076
0
                bestME[1].bits = meData.bits;
3077
3078
0
                bestCost = bestME[1].cost;
3079
0
            }
3080
0
            else
3081
0
                x265_log(NULL, X265_LOG_ERROR, "Invalid ME mode");
3082
3083
0
            if (mrgCost < bestCost)
3084
0
                isMerge = true;
3085
0
        }
3086
3087
        /* select best option and store into CU */
3088
0
        if ((mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) || isMerge)
3089
0
        {
3090
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
3091
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
3092
0
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
3093
0
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
3094
0
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
3095
0
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
3096
0
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
3097
3098
0
            totalmebits += merge.bits;
3099
0
        }
3100
0
        else if ((bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) || isBidir)
3101
0
        {
3102
0
            lastMode = 2;
3103
3104
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3105
0
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
3106
0
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
3107
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3108
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
3109
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
3110
3111
0
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
3112
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3113
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
3114
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
3115
3116
0
            totalmebits += bidirBits;
3117
0
        }
3118
0
        else if ((bestME[0].cost <= bestME[1].cost) || uniL0)
3119
0
        {
3120
0
            lastMode = 0;
3121
3122
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3123
0
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3124
0
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
3125
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3126
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
3127
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
3128
3129
0
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3130
0
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
3131
3132
0
            totalmebits += bestME[0].bits;
3133
0
        }
3134
0
        else
3135
0
        {
3136
0
            lastMode = 1;
3137
3138
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3139
0
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
3140
0
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
3141
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3142
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
3143
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
3144
3145
0
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3146
0
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
3147
3148
0
            totalmebits += bestME[1].bits;
3149
0
        }
3150
3151
0
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
3152
0
    }
3153
0
    interMode.sa8dBits += totalmebits;
3154
0
}
3155
3156
#if ENABLE_SCC_EXT
3157
uint32_t Search::getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height)
3158
{
3159
    uint32_t dist = 0;
3160
3161
    for (int i = 0; i < height; i++)
3162
    {
3163
        for (int j = 0; j < width; j++)
3164
        {
3165
            dist += abs(ref[j] - curr[j]);
3166
        }
3167
        ref += refStride;
3168
        curr += currStride;
3169
    }
3170
    return dist;
3171
}
3172
3173
int Search::intraBCSearchMVChromaRefine(Mode& intraBCMode,
3174
    const CUGeom& cuGeom,
3175
    int         roiWidth,
3176
    int         roiHeight,
3177
    int         cuPelX,
3178
    int         cuPelY,
3179
    uint32_t* sadBestCand,
3180
    MV* MVCand,
3181
    uint32_t    partOffset,
3182
    int         puIdx
3183
)
3184
{
3185
    int bestCandIdx = 0;
3186
    uint32_t  sadBest = UINT_MAX;
3187
    uint32_t  tempSad;
3188
3189
    pixel* ref;
3190
    const pixel* picOrg;
3191
    int refStride, orgStride;
3192
    int width, height;
3193
3194
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3195
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3196
3197
    CUData& cu = intraBCMode.cu;
3198
    Yuv& tmpPredYuv = intraBCMode.predYuv;
3199
    PredictionUnit pu(cu, cuGeom, puIdx);
3200
3201
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3202
    {
3203
        if ((!MVCand[cand].x) && (!MVCand[cand].y))
3204
        {
3205
            continue;
3206
        }
3207
3208
        if (((int)(cuPelY + MVCand[cand].y + roiHeight) >= picHeight) || ((cuPelY + MVCand[cand].y) < 0))
3209
        {
3210
            continue;
3211
        }
3212
3213
        if (((int)(cuPelX + MVCand[cand].x + roiWidth) >= picWidth) || ((cuPelX + MVCand[cand].x) < 0))
3214
        {
3215
            continue;
3216
        }
3217
3218
        tempSad = sadBestCand[cand];
3219
        int bitDepths = m_param->sourceBitDepth;
3220
        MV mvQuaterPixl = MVCand[cand];
3221
        mvQuaterPixl <<= 2;
3222
        cu.setPUMv(0, mvQuaterPixl, pu.puAbsPartIdx, puIdx);
3223
        cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
3224
        cu.setPUMv(1, MV(), pu.puAbsPartIdx, puIdx);
3225
        cu.setPURefIdx(1, -1, pu.puAbsPartIdx, puIdx);
3226
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3227
3228
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
3229
3230
        for (uint32_t ch = TEXT_CHROMA_U; ch < MAX_NUM_COMPONENT; ch++)
3231
        {
3232
            ref = m_slice->m_refFrameList[0][m_slice->m_numRefIdx[0] - 1]->m_reconPic[1]->getChromaAddr(ch, cu.m_cuAddr, cu.m_absIdxInCTU + partOffset);
3233
3234
            picOrg = intraBCMode.fencYuv->getChromaAddr(ch, partOffset);
3235
            orgStride = intraBCMode.fencYuv->m_csize;
3236
3237
            refStride = m_frame->m_reconPic[1]->m_strideC;
3238
3239
            width = roiWidth >> m_hChromaShift;
3240
            height = roiHeight >> m_vChromaShift;
3241
3242
            ref = tmpPredYuv.getChromaAddr(ch, partOffset);
3243
            refStride = tmpPredYuv.m_csize;
3244
3245
            for (int row = 0; row < height; row++)
3246
            {
3247
                for (int col = 0; col < width; col++)
3248
                {
3249
                    tempSad += ((abs(ref[col] - picOrg[col])) >> (bitDepths - 8));
3250
                }
3251
                ref += refStride;
3252
                picOrg += orgStride;
3253
            }
3254
        }
3255
3256
        if (tempSad < sadBest)
3257
        {
3258
            sadBest = tempSad;
3259
            bestCandIdx = cand;
3260
        }
3261
    }
3262
3263
    return bestCandIdx;
3264
}
3265
3266
void Search::updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc)
3267
{
3268
    if (roiWidth + roiHeight > 8)
3269
    {
3270
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvCand, CHROMA_REFINEMENT_CANDIDATES, false);
3271
3272
        if (roiWidth + roiHeight == 32)
3273
        {
3274
            ibc.m_numBV16s = ibc.m_numBVs;
3275
        }
3276
    }
3277
}
3278
3279
void Search::intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* MVCand)
3280
{
3281
    int j = CHROMA_REFINEMENT_CANDIDATES - 1;
3282
3283
    if (sad < sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3284
    {
3285
        for (int t = CHROMA_REFINEMENT_CANDIDATES - 1; t >= 0; t--)
3286
        {
3287
            if (sad < sadBestCand[t])
3288
            {
3289
                j = t;
3290
            }
3291
        }
3292
3293
        for (int k = CHROMA_REFINEMENT_CANDIDATES - 1; k > j; k--)
3294
        {
3295
            sadBestCand[k] = sadBestCand[k - 1];
3296
3297
            MVCand[k].set(MVCand[k - 1].x, MVCand[k - 1].y);
3298
        }
3299
        sadBestCand[j] = sad;
3300
        MVCand[j].set(x, y);
3301
    }
3302
}
3303
3304
uint32_t Search::mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel)
3305
{
3306
    for (uint32_t cand = 0; cand < sn && dn < SCM_S0067_NUM_CANDIDATES; cand++)
3307
    {
3308
        bool found = false;
3309
        MV TempMv = src[cand];
3310
        if (!isSrcQuarPel)
3311
        {
3312
            TempMv <<= 2;
3313
        }
3314
        for (uint32_t j = 0; j < dn; j++)
3315
        {
3316
            if (TempMv == dst[j])
3317
            {
3318
                found = true;
3319
                break;
3320
            }
3321
        }
3322
3323
        if (!found)
3324
        {
3325
            dst[dn] = TempMv;
3326
            dn++;
3327
        }
3328
    }
3329
    return dn;
3330
}
3331
3332
void Search::restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand)
3333
{
3334
    {
3335
        for (uint32_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand)
3336
        {
3337
            if (interDirNeighbours[mergeCand] == 3)
3338
            {
3339
                bool b8x8BiPredRestricted = cu->is8x8BipredRestriction(
3340
                    mvFieldNeighbours[mergeCand][0].mv,
3341
                    mvFieldNeighbours[mergeCand][1].mv,
3342
                    mvFieldNeighbours[mergeCand][0].refIdx,
3343
                    mvFieldNeighbours[mergeCand][1].refIdx);
3344
3345
                int width = 0;
3346
                int height = 0;
3347
                uint32_t partAddr;
3348
3349
                cu->getPartIndexAndSize(puIdx, partAddr, width, height);
3350
                if (b8x8BiPredRestricted)
3351
                {
3352
                    if (width <= 8 && height <= 8)
3353
                    {
3354
                        interDirNeighbours[mergeCand] = 1;
3355
                        mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
3356
                    }
3357
                }
3358
                else if (cu->isBipredRestriction())
3359
                {
3360
                    interDirNeighbours[mergeCand] = 1;
3361
                    mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
3362
                }
3363
            }
3364
        }
3365
    }
3366
}
3367
3368
bool Search::isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* cu,
3369
    int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize)
3370
{
3371
    static const int s_floorLog2[65] =
3372
    {
3373
      -1, 0, 1, 1, 2, 2, 2, 2, 3, 3,
3374
       3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
3375
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
3376
       4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
3377
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3378
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3379
       5, 5, 5, 5, 6
3380
    };
3381
3382
    int ctuSizeLog2 = s_floorLog2[ctuSize];
3383
    int interpolationSamplesX = (cu->m_chromaFormat == X265_CSP_I422 || cu->m_chromaFormat == X265_CSP_I420) ? ((xBv & 0x1) << 1) : 0;
3384
    int interpolationSamplesY = (cu->m_chromaFormat == X265_CSP_I420) ? ((yBv & 0x1) << 1) : 0;
3385
    int refRightX = xPos + xBv + width - 1 + interpolationSamplesX;
3386
    int refBottomY = yPos + yBv + height - 1 + interpolationSamplesY;
3387
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3388
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3389
3390
    if ((xPos + xBv - interpolationSamplesX) < 0)
3391
        return false;
3392
    if (refRightX >= picWidth)
3393
        return false;
3394
    if ((yPos + yBv - interpolationSamplesY) < 0)
3395
        return false;
3396
    if (refBottomY >= picHeight)
3397
        return false;
3398
3399
    if ((xBv + width + interpolationSamplesX) > 0 && (yBv + height + interpolationSamplesY) > 0)
3400
        return false;
3401
3402
    if (refBottomY >> ctuSizeLog2 < yPos >> ctuSizeLog2)
3403
    {
3404
        int refCuX = refRightX / ctuSize;
3405
        int refCuY = refBottomY / ctuSize;
3406
        int cuPelX = xPos / ctuSize;
3407
        int cuPelY = yPos / ctuSize;
3408
3409
        if (((int)(refCuX - cuPelX) > (int)((cuPelY - refCuY))))
3410
            return false;
3411
        else
3412
            return true;
3413
    }
3414
3415
    if (refBottomY >> ctuSizeLog2 > yPos >> ctuSizeLog2)
3416
    {
3417
        return false;
3418
    }
3419
3420
    // in the same CTU line
3421
    if (refRightX >> ctuSizeLog2 < xPos >> ctuSizeLog2)
3422
        return true;
3423
    if (refRightX >> ctuSizeLog2 > xPos >> ctuSizeLog2)
3424
        return false;
3425
3426
    // same CTU
3427
    int mask = 1 << ctuSizeLog2;
3428
    mask -= 1;
3429
    int rasterCurr = ((((yPos & mask) - yStartInCU) >> 2) << (ctuSizeLog2 - 2)) + (((xPos & mask) - xStartInCU) >> 2);
3430
    int rasterRef = (((refBottomY & mask) >> 2) << (ctuSizeLog2 - 2)) + ((refRightX & mask) >> 2);
3431
3432
    if (g_rasterToZscan[rasterRef] >= g_rasterToZscan[rasterCurr])
3433
        return false;
3434
    return true;
3435
}
3436
3437
bool Search::isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset)
3438
{
3439
    const int  cuPelX = cu->m_cuPelX + g_zscanToPelX[partOffset];
3440
    const int  cuPelY = cu->m_cuPelY + g_zscanToPelY[partOffset];
3441
3442
    if (!isBlockVectorValid(cuPelX, cuPelY, roiWidth, roiHeight, cu, g_zscanToPelX[partOffset], g_zscanToPelY[partOffset], predX, predY, m_param->maxCUSize))
3443
    {
3444
        return false;
3445
    }
3446
    return true;
3447
}
3448
3449
void Search::intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
3450
    MV& mv, uint32_t& cost, int roiWidth, int roiHeight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
3451
{
3452
    const int   srchRngHorLeft = searchRangeLT->x;
3453
    const int   srchRngHorRight = searchRangeRB->x;
3454
    const int   srchRngVerTop = searchRangeLT->y;
3455
    const int   srchRngVerBottom = searchRangeRB->y;
3456
3457
    CUData& cu = intraBCMode.cu;
3458
    const uint32_t  lcuWidth = m_param->maxCUSize;
3459
    const uint32_t  lcuHeight = m_param->maxCUSize;
3460
    const int       puPelOffsetX = g_zscanToPelX[partAddr];
3461
    const int       puPelOffsetY = g_zscanToPelY[partAddr];
3462
    const int       cuPelX = cu.m_cuPelX + puPelOffsetX;  // Point to the location of PU
3463
    const int       cuPelY = cu.m_cuPelY + puPelOffsetY;
3464
3465
    uint32_t  sad = 0;
3466
    uint32_t  sadBest = UINT_MAX;
3467
    int         bestX = 0;
3468
    int         bestY = 0;
3469
    pixel* refSrch;
3470
3471
    int         bestCandIdx = 0;
3472
    uint32_t    partOffset = 0;
3473
    MV          MVCand[CHROMA_REFINEMENT_CANDIDATES];
3474
    uint32_t    sadBestCand[CHROMA_REFINEMENT_CANDIDATES];
3475
3476
    partOffset = partAddr;
3477
    PredictionUnit pu(cu, cuGeom, puIdx);
3478
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3479
    {
3480
        sadBestCand[cand] = UINT_MAX;
3481
        MVCand[cand].set(0, 0);
3482
    }
3483
3484
    const int         relCUPelX = cuPelX % lcuWidth;
3485
    const int         relCUPelY = cuPelY % lcuHeight;
3486
    const int chromaROIWidthInPixels = roiWidth;
3487
    const int chromaROIHeightInPixels = roiHeight;
3488
    bool fastsearch = (m_param->bEnableSCC == 1) ? true : false;
3489
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3490
3491
    if (fastsearch)
3492
    {
3493
        uint32_t tempSadBest = 0;
3494
        int srLeft = srchRngHorLeft, srRight = srchRngHorRight, srTop = srchRngVerTop, srBottom = srchRngVerBottom;
3495
        const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3496
        const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3497
3498
        if (isFullFrameSearchrangeEnabled)//full frame search
3499
        {
3500
            srLeft = -1 * cuPelX;
3501
            srTop = -1 * cuPelY;
3502
3503
            srRight = picWidth - cuPelX - roiWidth;
3504
            srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3505
3506
            if (cuPelX + srRight + roiWidth > (int)picWidth)
3507
            {
3508
                srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3509
            }
3510
            if (cuPelY + srBottom + roiHeight > (int)picHeight)
3511
            {
3512
                srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3513
            }
3514
        }
3515
3516
        if (roiWidth > 8 || roiHeight > 8)
3517
            ibc.m_numBVs = 0;
3518
        else if (roiWidth + roiHeight == 16)
3519
            ibc.m_numBVs = ibc.m_numBV16s;
3520
        if (testOnlyPred)
3521
            ibc.m_numBVs = 0;
3522
3523
        MV  mvPredEncOnly[16];
3524
        int nbPreds = 0;
3525
        cu.getIntraBCMVPsEncOnly(partAddr, mvPredEncOnly, nbPreds, puIdx);
3526
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvPredEncOnly, nbPreds, true);
3527
3528
        for (int cand = 0; cand < ibc.m_numBVs; cand++)
3529
        {
3530
            int xPred = ibc.m_BVs[cand].x >> 2;
3531
            int yPred = ibc.m_BVs[cand].y >> 2;
3532
            if (!(xPred == 0 && yPred == 0) && !((yPred < srTop) || (yPred > srBottom)) && !((xPred < srLeft) || (xPred > srRight)))
3533
            {
3534
                int tempY = yPred + relCUPelY + roiHeight - 1;
3535
                int tempX = xPred + relCUPelX + roiWidth - 1;
3536
                bool validCand = isValidIntraBCSearchArea(&cu, xPred, yPred, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset);
3537
3538
                if ((tempX >= (int)lcuWidth) && (tempY >= 0) && isFullFrameSearchrangeEnabled)
3539
                    validCand = false;
3540
3541
                if ((tempX >= 0) && (tempY >= 0))
3542
                {
3543
                    int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3544
                    uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3545
                    if (tempZscanIdx >= cu.m_absIdxInCTU)
3546
                    {
3547
                        validCand = false;
3548
                    }
3549
                }
3550
3551
                if (validCand)
3552
                {
3553
                    sad = m_me.mvcost(ibc.m_BVs[cand]);
3554
3555
                    refSrch = refY + yPred * refStride + xPred;
3556
3557
                    sad += m_me.bufSAD(refSrch, refStride);
3558
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3559
                    {
3560
                        continue;
3561
                    }
3562
3563
                    intraBCSearchMVCandUpdate(sad, xPred, yPred, sadBestCand, MVCand);
3564
                }
3565
            }
3566
        }
3567
        bestX = MVCand[0].x;
3568
        bestY = MVCand[0].y;
3569
        mv.set(bestX, bestY);
3570
        sadBest = sadBestCand[0];
3571
3572
        if (testOnlyPred)
3573
        {
3574
            cost = sadBest;
3575
            return;
3576
        }
3577
3578
        const int boundY = (0 - roiHeight - puPelOffsetY);
3579
        int lowY = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3580
            ? -cuPelY : X265_MAX(srchRngVerTop, 0 - cuPelY);
3581
        for (int y = boundY; y >= lowY; y--)
3582
        {
3583
            if (!isValidIntraBCSearchArea(&cu, 0, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3584
            {
3585
                continue;
3586
            }
3587
3588
            sad = m_me.mvcost(MV(0, y));
3589
3590
            refSrch = refY + y * refStride;
3591
3592
            sad += m_me.bufSAD(refSrch, refStride);
3593
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3594
            {
3595
                continue;
3596
            }
3597
3598
            intraBCSearchMVCandUpdate(sad, 0, y, sadBestCand, MVCand);
3599
            tempSadBest = sadBestCand[0];
3600
            if (sadBestCand[0] <= 3)
3601
            {
3602
                bestX = MVCand[0].x;
3603
                bestY = MVCand[0].y;
3604
                sadBest = sadBestCand[0];
3605
                mv.set(bestX, bestY);
3606
                cost = sadBest;
3607
3608
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3609
                return;
3610
            }
3611
        }
3612
3613
        const int boundX = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3614
            ? -cuPelX : X265_MAX(srchRngHorLeft, -cuPelX);
3615
        for (int x = 0 - roiWidth - puPelOffsetX; x >= boundX; --x)
3616
        {
3617
            if (!isValidIntraBCSearchArea(&cu, x, 0, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3618
            {
3619
                continue;
3620
            }
3621
3622
            sad = m_me.mvcost(MV(x, 0));
3623
3624
            refSrch = refY + x;
3625
            sad += m_me.bufSAD(refSrch, refStride);
3626
3627
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3628
            {
3629
                continue;
3630
            }
3631
3632
            intraBCSearchMVCandUpdate(sad, x, 0, sadBestCand, MVCand);
3633
            tempSadBest = sadBestCand[0];
3634
            if (sadBestCand[0] <= 3)
3635
            {
3636
                bestX = MVCand[0].x;
3637
                bestY = MVCand[0].y;
3638
                sadBest = sadBestCand[0];
3639
                mv.set(bestX, bestY);
3640
                cost = sadBest;
3641
3642
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3643
                return;
3644
            }
3645
        }
3646
3647
        bestX = MVCand[0].x;
3648
        bestY = MVCand[0].y;
3649
        sadBest = sadBestCand[0];
3650
3651
        if ((!bestX && !bestY) || (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 32))
3652
        {
3653
            //chroma refine
3654
            bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3655
            bestX = MVCand[bestCandIdx].x;
3656
            bestY = MVCand[bestCandIdx].y;
3657
            sadBest = sadBestCand[bestCandIdx];
3658
            mv.set(bestX, bestY);
3659
            cost = sadBest;
3660
3661
            updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3662
            return;
3663
        }
3664
3665
        if (cuGeom.depth > 2 && !bUse1DSearchFor8x8)
3666
        {
3667
            for (int y = X265_MAX(srchRngVerTop, -cuPelY); y <= srchRngVerBottom; y += 2)
3668
            {
3669
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3670
                {
3671
                    continue;
3672
                }
3673
3674
                int tempY = y + relCUPelY + roiHeight - 1;
3675
3676
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x++)
3677
                {
3678
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3679
                    {
3680
                        continue;
3681
                    }
3682
3683
                    int tempX = x + relCUPelX + roiWidth - 1;
3684
3685
                    if ((tempX >= 0) && (tempY >= 0))
3686
                    {
3687
                        int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3688
                        uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3689
                        if (iTempZscanIdx >= cu.m_absIdxInCTU)
3690
                        {
3691
                            continue;
3692
                        }
3693
                    }
3694
3695
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3696
                    {
3697
                        continue;
3698
                    }
3699
3700
                    sad = m_me.mvcost(MV(x, y));
3701
3702
                    refSrch = refY + y * refStride + x;
3703
                    sad += m_me.bufSAD(refSrch, refStride);
3704
3705
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3706
                }
3707
            }
3708
3709
            bestX = MVCand[0].x;
3710
            bestY = MVCand[0].y;
3711
            sadBest = sadBestCand[0];
3712
            if (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 16)
3713
            {
3714
                //chroma refine
3715
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3716
                bestX = MVCand[bestCandIdx].x;
3717
                bestY = MVCand[bestCandIdx].y;
3718
                sadBest = sadBestCand[bestCandIdx];
3719
                mv.set(bestX, bestY);
3720
                cost = sadBest;
3721
3722
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3723
                return;
3724
            }
3725
3726
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3727
            {
3728
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3729
                {
3730
                    continue;
3731
                }
3732
3733
                int tempY = y + relCUPelY + roiHeight - 1;
3734
3735
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x += 2)
3736
                {
3737
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3738
                    {
3739
                        continue;
3740
                    }
3741
3742
                    int tempX = x + relCUPelX + roiWidth - 1;
3743
3744
                    if ((tempX >= 0) && (tempY >= 0))
3745
                    {
3746
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3747
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3748
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3749
                        {
3750
                            continue;
3751
                        }
3752
                    }
3753
3754
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3755
                    {
3756
                        continue;
3757
                    }
3758
3759
                    sad = m_me.mvcost(MV(x, y));
3760
3761
                    refSrch = refY + y * refStride + x;
3762
                    sad += m_me.bufSAD(refSrch, refStride);
3763
3764
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3765
                    {
3766
                        continue;
3767
                    }
3768
3769
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3770
                    if (sadBestCand[0] <= 5)
3771
                    {
3772
                        //chroma refine & return
3773
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3774
                        bestX = MVCand[bestCandIdx].x;
3775
                        bestY = MVCand[bestCandIdx].y;
3776
                        sadBest = sadBestCand[bestCandIdx];
3777
                        mv.set(bestX, bestY);
3778
                        cost = sadBest;
3779
3780
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3781
                        return;
3782
                    }
3783
                }
3784
            }
3785
3786
            bestX = MVCand[0].x;
3787
            bestY = MVCand[0].y;
3788
            sadBest = sadBestCand[0];
3789
3790
            if ((sadBest >= tempSadBest) || ((sadBest - m_me.mvcost(MV(bestX, bestY))) <= 32))
3791
            {
3792
                //chroma refine
3793
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3794
                bestX = MVCand[bestCandIdx].x;
3795
                bestY = MVCand[bestCandIdx].y;
3796
                sadBest = sadBestCand[bestCandIdx];
3797
                mv.set(bestX, bestY);
3798
                cost = sadBest;
3799
3800
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3801
                return;
3802
            }
3803
3804
            tempSadBest = sadBestCand[0];
3805
3806
3807
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3808
            {
3809
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3810
                {
3811
                    continue;
3812
                }
3813
3814
                int tempY = y + relCUPelY + roiHeight - 1;
3815
3816
                for (int x = (X265_MAX(srchRngHorLeft, -cuPelX) + 1); x <= srchRngHorRight; x += 2)
3817
                {
3818
3819
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3820
                    {
3821
                        continue;
3822
                    }
3823
3824
                    int tempX = x + relCUPelX + roiWidth - 1;
3825
3826
                    if ((tempX >= 0) && (tempY >= 0))
3827
                    {
3828
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3829
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3830
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3831
                        {
3832
                            continue;
3833
                        }
3834
                    }
3835
3836
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3837
                    {
3838
                        continue;
3839
                    }
3840
3841
                    sad = m_me.mvcost(MV(x, y));
3842
3843
                    refSrch = refY + y * refStride + x;
3844
                    sad += m_me.bufSAD(refSrch, refStride);
3845
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3846
                    {
3847
                        continue;
3848
                    }
3849
3850
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3851
                    if (sadBestCand[0] <= 5)
3852
                    {
3853
                        //chroma refine & return
3854
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3855
                        bestX = MVCand[bestCandIdx].x;
3856
                        bestY = MVCand[bestCandIdx].y;
3857
                        sadBest = sadBestCand[bestCandIdx];
3858
                        mv.set(bestX, bestY);
3859
                        cost = sadBest;
3860
3861
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3862
                        return;
3863
                    }
3864
                }
3865
            }
3866
        }
3867
    }
3868
    else //full search
3869
    {
3870
        refY += (srchRngVerBottom * refStride);
3871
        int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3872
        int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3873
3874
        for (int y = srchRngVerBottom; y >= srchRngVerTop; y--)
3875
        {
3876
            if (((int)(cuPelY + y) < 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3877
            {
3878
                refY -= refStride;
3879
                continue;
3880
            }
3881
3882
            for (int x = srchRngHorLeft; x <= srchRngHorRight; x++)
3883
            {
3884
3885
                if (((int)(cuPelX + x) < 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3886
                {
3887
                    continue;
3888
                }
3889
3890
                int tempX = x + relCUPelX + roiWidth - 1;
3891
                int tempY = y + relCUPelY + roiHeight - 1;
3892
                if ((tempX >= 0) && (tempY >= 0))
3893
                {
3894
                    int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3895
                    uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3896
                    if (iTempZscanIdx >= cu.m_absIdxInCTU)
3897
                    {
3898
                        continue;
3899
                    }
3900
                }
3901
3902
                if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3903
                {
3904
                    continue;
3905
                }
3906
3907
                refSrch = refY + x;
3908
3909
                sad = m_me.bufSAD(refSrch, refStride);
3910
                sad += m_me.mvcost(MV(x, y));
3911
                if (sad < sadBest)
3912
                {
3913
                    sadBest = sad;
3914
                    bestX = x;
3915
                    bestY = y;
3916
                }
3917
                intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3918
            }
3919
3920
            refY -= refStride;
3921
        }
3922
    }
3923
3924
    bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3925
    bestX = MVCand[bestCandIdx].x;
3926
    bestY = MVCand[bestCandIdx].y;
3927
    sadBest = sadBestCand[bestCandIdx];
3928
    mv.set(bestX, bestY);
3929
    cost = sadBest;
3930
3931
    updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3932
3933
}
3934
3935
void Search::setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB)
3936
{
3937
    MV mvPred = pred;
3938
    CUData& cu = intraBCMode.cu;
3939
    cu.clipMv(mvPred);
3940
    int srLeft, srRight, srTop, srBottom;
3941
    int puWidth, puHeight;
3942
    uint32_t partAddr;
3943
3944
    cu.getPartIndexAndSize(puIdx, partAddr, puWidth, puHeight);
3945
3946
    const uint32_t lcuWidth = m_param->maxCUSize;
3947
    const uint32_t lcuHeight = m_param->maxCUSize;
3948
    const uint32_t cuPelX = cu.m_cuPelX + g_zscanToPelX[partAddr];
3949
    const uint32_t cuPelY = cu.m_cuPelY + g_zscanToPelY[partAddr];
3950
3951
    const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3952
    const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3953
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3954
    if (1 << cu.m_log2CUSize[0] == 16 && cu.m_partSize[0] == SIZE_2Nx2N && isFullFrameSearchrangeEnabled)// full frame search
3955
    {
3956
        srLeft = -1 * cuPelX;
3957
        srTop = -1 * cuPelY;
3958
3959
        srRight = picWidth - cuPelX - roiWidth;
3960
        srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3961
    }
3962
    else
3963
    {
3964
        const uint32_t searchWidthInCTUs = 1 << cu.m_log2CUSize[0] == 8 ? 1 : (isFullFrameSearchrangeEnabled) ? -1 : 1;
3965
        uint32_t width = 0, maxWidth = searchWidthInCTUs * lcuWidth;
3966
        for (const CUData* pTestCU = cu.m_cuLeft;
3967
            width < maxWidth && pTestCU != NULL && pTestCU->m_slice != NULL;
3968
            pTestCU = pTestCU->m_cuLeft, width += lcuWidth)
3969
        {
3970
        }
3971
        int maxXsr = (cuPelX % lcuWidth) + X265_MIN(maxWidth, width);
3972
        int maxYsr = cuPelY % lcuHeight;
3973
3974
        if (cu.m_chromaFormat == X265_CSP_I420 || cu.m_chromaFormat == X265_CSP_I422) maxXsr &= ~0x4;
3975
        if (cu.m_chromaFormat == X265_CSP_I420)                                       maxYsr &= ~0x4;
3976
3977
        srLeft = -maxXsr;
3978
        srTop = -maxYsr;
3979
3980
        srRight = lcuWidth - cuPelX % lcuWidth - roiWidth;
3981
        srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3982
    }
3983
3984
    if (cuPelX + srRight + roiWidth > picWidth)
3985
    {
3986
        srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3987
    }
3988
    if (cuPelY + srBottom + roiHeight > picHeight)
3989
    {
3990
        srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3991
    }
3992
3993
    searchRangeLT.x = srLeft;
3994
    searchRangeLT.y = srTop;
3995
    searchRangeRB.x = srRight;
3996
    searchRangeRB.y = srBottom;
3997
3998
    cu.clipMv(searchRangeLT);
3999
    cu.clipMv(searchRangeRB);
4000
4001
}
4002
4003
void Search::intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
4004
{
4005
    uint32_t         partAddr;
4006
    int              roiWidth;
4007
    int              roiHeight;
4008
4009
    MV   searchRangeLT;
4010
    MV   searchRangeRB;
4011
    MV   mvPred = *pred;
4012
    const MV predictors = *pred;
4013
4014
    CUData& cu = intraBCMode.cu;
4015
    cu.getPartIndexAndSize(puIdx, partAddr, roiWidth, roiHeight);
4016
4017
    int ref = m_slice->m_numRefIdx[0] - 1;
4018
    pixel* refY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->getLumaAddr(cu.m_cuAddr, cu.m_absIdxInCTU + partAddr);
4019
    int  strideY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->m_stride;
4020
4021
    setIntraSearchRange(intraBCMode, mvPred, puIdx, roiWidth, roiHeight, searchRangeLT, searchRangeRB);
4022
4023
    m_me.setMVP(predictors);
4024
4025
    intraPatternSearch(intraBCMode, cuGeom, puIdx, partAddr, refY, strideY, &searchRangeLT, &searchRangeRB, mv, cost, roiWidth, roiHeight, testOnlyPred, bUse1DSearchFor8x8, ibc);
4026
}
4027
4028
bool Search::predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
4029
{
4030
    MV zeroMv(0, 0);
4031
    CUData& cu = intraBCMode.cu;
4032
    Yuv* predYuv = &intraBCMode.predYuv;
4033
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
4034
    int  numPart = cu.getNumPartInter(0);
4035
    int log2ParallelMergeLevelMinus2 = 0;
4036
4037
    // 12 mv candidates including lowresMV
4038
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
4039
4040
    if (m_param->bEnableSCC == 1 && (1 << cu.m_log2CUSize[0]) > SCM_S0067_MAX_CAND_SIZE) // fast search
4041
        return false;
4042
4043
    uint32_t totalCost = 0;
4044
    for (int puIdx = 0; puIdx < numPart; puIdx++)
4045
    {
4046
        int width, height;
4047
        uint32_t partAddr = 0;
4048
        MotionData* bestME = intraBCMode.bestME[puIdx];
4049
        PredictionUnit pu(cu, cuGeom, puIdx);
4050
        MV  mv, mvPred[2];
4051
        cu.getPartIndexAndSize(puIdx, pu.puAbsPartIdx, width, height);
4052
        partAddr = pu.puAbsPartIdx;
4053
        m_me.setSourcePU(*intraBCMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
4054
4055
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, intraBCMode.interNeighbours);
4056
        cu.getPMV(intraBCMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, puIdx, pu.puAbsPartIdx);
4057
4058
        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
4059
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);
4060
4061
        uint32_t cost;
4062
        mv.set(0, 0);
4063
        intraBlockCopyEstimate(intraBCMode, cuGeom, puIdx, mvPred, mv, cost, testOnlyPred, bUse1DSearchFor8x8, ibc);
4064
4065
        bestME->mv.set(mv.x << 2, mv.y << 2);
4066
        bestME->cost = cost;
4067
        totalCost += cost;
4068
        if (mv.x == 0 && mv.y == 0)
4069
        {
4070
            if (testOnlyPred)
4071
            {
4072
                m_lastCandCost = MAX_UINT;
4073
            }
4074
            return false;
4075
        }
4076
4077
        int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
4078
        int distAMVPBest, distMergeTemp;
4079
        int costAMVPBest, costMergeBest, costMergeTemp;
4080
        bitsAMVPBest = MAX_INT;
4081
        costAMVPBest = MAX_INT;
4082
        costMergeBest = MAX_INT;
4083
        int mvpIdxBest = 0;
4084
        int mvpIdxTemp;
4085
        int mrgIdxBest = -1;
4086
        int mrgIdxTemp = -1;
4087
        int xCUStart = cu.m_cuPelX;
4088
        int yCUStart = cu.m_cuPelY;
4089
        int xStartInCU = 0, yStartInCU = 0;
4090
        if (ePartSize == SIZE_2Nx2N)
4091
            xStartInCU = yStartInCU = 0;
4092
        else if (ePartSize == SIZE_2NxN)
4093
        {
4094
            xStartInCU = 0;
4095
            yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
4096
        }
4097
        else if (ePartSize == SIZE_Nx2N)
4098
        {
4099
            xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
4100
            yStartInCU = 0;
4101
        }
4102
        const pixel* currStart;
4103
        pixel* ref;
4104
        int currStride, refStride;
4105
        distAMVPBest = 0;
4106
4107
        MV cMvQuaterPixl = mv;
4108
        cMvQuaterPixl <<= 2;
4109
        cu.setPUMv(0, cMvQuaterPixl, pu.puAbsPartIdx, puIdx);
4110
        cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
4111
        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
4112
        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
4113
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
4114
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4115
        for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4116
        {
4117
            int tempHeight, tempWidth;
4118
            if (ch == 0)
4119
            {
4120
                tempHeight = height;
4121
                tempWidth = width;
4122
                ref = tmpPredYuv.getLumaAddr(partAddr);
4123
                refStride = tmpPredYuv.m_size;
4124
                distAMVPBest += m_me.bufSAD(ref, refStride);
4125
            }
4126
            else
4127
            {
4128
                tempHeight = height >> m_vChromaShift;
4129
                tempWidth = width >> m_hChromaShift;
4130
4131
                currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
4132
                currStride = intraBCMode.fencYuv->m_csize;
4133
                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4134
                refStride = tmpPredYuv.m_csize;
4135
                distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4136
            }
4137
        }
4138
4139
        mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2);
4140
        mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2);
4141
4142
        for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4143
        {
4144
            m_me.setMVP(mvPred[mvpIdxTemp]);
4145
            bitsAMVPTemp = m_me.bitcost(mv, mvPred[mvpIdxTemp]);
4146
            if (bitsAMVPTemp < bitsAMVPBest)
4147
            {
4148
                bitsAMVPBest = bitsAMVPTemp;
4149
                mvpIdxBest = mvpIdxTemp;
4150
            }
4151
        }
4152
4153
        bitsAMVPBest++; // for MVP Index bits
4154
        costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4155
4156
        MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4157
        uint8_t uhInterDirNeighbours[MRG_MAX_NUM_CANDS];
4158
        int numValidMergeCand = 0;
4159
4160
        for (int i = 0; i < MRG_MAX_NUM_CANDS; i++)
4161
        {
4162
            cMvFieldNeighbours[i][0].mv.set(0, 0);
4163
            cMvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
4164
        }
4165
4166
        if (ePartSize != SIZE_2Nx2N)
4167
        {
4168
            if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4169
            {
4170
                cu.setPartSizeSubParts(SIZE_2Nx2N);
4171
                if (puIdx == 0)
4172
                {
4173
                    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, uhInterDirNeighbours);
4174
                }
4175
                cu.setPartSizeSubParts(ePartSize);
4176
            }
4177
            else
4178
            {
4179
                numValidMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, cMvFieldNeighbours, uhInterDirNeighbours);
4180
            }
4181
4182
            cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
4183
            restrictBipredMergeCand(&cu, puIdx, cMvFieldNeighbours, uhInterDirNeighbours, numValidMergeCand);
4184
4185
            for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCand; mrgIdxTemp++)
4186
            {
4187
                if (uhInterDirNeighbours[mrgIdxTemp] != 1)
4188
                {
4189
                    continue;
4190
                }
4191
                if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4192
                {
4193
                    continue;
4194
                }
4195
4196
                if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, width, height, &cu,
4197
                    xStartInCU, yStartInCU, (cMvFieldNeighbours[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighbours[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4198
                {
4199
                    continue;
4200
                }
4201
                bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4202
4203
                distMergeTemp = 0;
4204
4205
                cu.setPUMv(0, cMvFieldNeighbours[mrgIdxTemp][0].mv, pu.puAbsPartIdx, puIdx);
4206
                cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, puIdx);
4207
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
4208
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
4209
                cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
4210
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4211
4212
                for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4213
                {
4214
                    int tempHeight, tempWidth;
4215
                    if (ch == 0)
4216
                    {
4217
                        tempHeight = height;
4218
                        tempWidth = width;
4219
                        ref = tmpPredYuv.getLumaAddr(partAddr);
4220
                        refStride = tmpPredYuv.m_size;
4221
                        distMergeTemp += m_me.bufSAD(ref, refStride);
4222
                    }
4223
                    else
4224
                    {
4225
                        tempHeight = height >> m_vChromaShift;
4226
                        tempWidth = width >> m_hChromaShift;
4227
4228
                        currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
4229
                        currStride = intraBCMode.fencYuv->m_csize;
4230
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4231
                        refStride = tmpPredYuv.m_csize;
4232
                        distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4233
                    }
4234
                }
4235
                costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4236
4237
                if (costMergeTemp < costMergeBest)
4238
                {
4239
                    costMergeBest = costMergeTemp;
4240
                    mrgIdxBest = mrgIdxTemp;
4241
                }
4242
            }
4243
        }
4244
        if (costAMVPBest < costMergeBest)
4245
        {
4246
            MV tempmv((mv.x << 2), (mv.y << 2));
4247
            MVField mvField[2];
4248
            mvField[0].mv = tempmv;
4249
            mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4250
            mvField[1].mv = zeroMv;
4251
            mvField[1].refIdx = REF_NOT_VALID;
4252
4253
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
4254
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction
4255
4256
            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
4257
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
4258
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
4259
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
4260
4261
            MV mvd;
4262
            mvd.set(mv.x - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].x >> 2), mv.y - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].y >> 2));
4263
4264
            cu.m_mvd[0][pu.puAbsPartIdx] = mvd;
4265
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mvpIdxBest;
4266
        }
4267
        else
4268
        {
4269
            MV MV(cMvFieldNeighbours[mrgIdxBest][0].mv.x, cMvFieldNeighbours[mrgIdxBest][0].mv.y);
4270
            MVField mvField[2];
4271
            mvField[0].mv = MV;
4272
            mvField[0].refIdx = cu.m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4273
            mvField[1].mv = zeroMv;
4274
            mvField[1].refIdx = REF_NOT_VALID;
4275
4276
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4277
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mrgIdxBest; /* merge candidate ID is stored in L0 MVP idx */
4278
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction
4279
4280
            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
4281
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
4282
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
4283
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
4284
4285
            cu.m_mvd[0][pu.puAbsPartIdx] = zeroMv;
4286
            cu.m_mvd[1][pu.puAbsPartIdx] = zeroMv;
4287
        }
4288
        motionCompensation(cu, pu, *predYuv, 1, 1);
4289
    }
4290
4291
    PredictionUnit pu(cu, cuGeom, 0);
4292
    uint32_t abortThreshold = (1 << cu.m_log2CUSize[0]) * (1 << cu.m_log2CUSize[0]) * 2;
4293
    if (testOnlyPred)
4294
    {
4295
        if (numPart == 1 && totalCost > abortThreshold)
4296
        {
4297
            m_lastCandCost = MAX_UINT;
4298
            return false;
4299
        }
4300
        m_lastCandCost = totalCost;
4301
    }
4302
    else if (totalCost < abortThreshold && 3 * totalCost >> 2 >= m_lastCandCost)
4303
    {
4304
        return false;
4305
    }
4306
    return true;
4307
}
4308
4309
bool Search::predMixedIntraBCInterSearch(Mode& intraBCMixedMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMvCandList)
4310
{
4311
    intraBCMixedMode.initCosts();
4312
    intraBCMixedMode.cu.setPartSizeSubParts(ePartSize);
4313
    intraBCMixedMode.cu.setPredModeSubParts(MODE_INTER);
4314
    CUData& cu = intraBCMixedMode.cu;
4315
    int numComb = 2;
4316
    int numPart = 2;
4317
    uint32_t cost[2] = { 0,0 };
4318
    uint32_t maxCost = UINT32_MAX;
4319
4320
    int      numPredDir = m_slice->isInterP() ? 1 : 2;
4321
    MV       cMvZero(0, 0);
4322
4323
    MV  cMvPredCand[2][2];
4324
    int IBCValidFlag = 0;
4325
    int bestIBCMvpIdx[2] = { 0, 0 };
4326
    int bestInterMvpIdx[2] = { 0, 0 };
4327
    int bestInterDir[2] = { 0, 0 };
4328
    int bestRefIdx[2] = { 0, 0 };
4329
    bool isMergeMode[2] = { false, false };
4330
    bool isIBCMergeMode[2] = { false, false };
4331
    MVField cMRGMvField[2][2];
4332
    MVField cMRGMvFieldIBC[2][2];
4333
    int log2ParallelMergeLevelMinus2 = 0;
4334
    // 12 mv candidates including lowresMV
4335
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
4336
4337
    Yuv* predYuv = &intraBCMixedMode.predYuv;
4338
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
4339
4340
    for (int combo = 0; combo < numComb; combo++) // number of combination
4341
    {
4342
        for (int partIdx = 0; partIdx < numPart; ++partIdx)
4343
        {
4344
            int dummyWidth, dummyHeight;
4345
            uint32_t partAddr = 0;
4346
            PredictionUnit pu(cu, cuGeom, partIdx);
4347
            cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth, dummyHeight);
4348
            m_me.setSourcePU(*intraBCMixedMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
4349
4350
            MV mvPred[2];
4351
            MV bvPred[2];
4352
            if ((combo == 0 && partIdx == 0) || (combo == 1 && partIdx == 1)) // intraBC
4353
            {
4354
                MV cMv = iMvCandList[8 + partIdx];
4355
                if (cMv.x == 0 && cMv.y == 0)
4356
                {
4357
                    cost[combo] = maxCost;
4358
                    IBCValidFlag++;
4359
                    break;
4360
                }
4361
4362
                cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4363
                cu.getPMV(intraBCMixedMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, partIdx, pu.puAbsPartIdx);
4364
4365
                bvPred[0] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0];
4366
                bvPred[1] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1];
4367
                bvPred[0] >>= 2;
4368
                bvPred[1] >>= 2;
4369
4370
                /////////////////////////////////////////////////////////////
4371
                // ibc merge
4372
                // choose one MVP and compare with merge mode
4373
4374
                int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
4375
                int distAMVPBest, distMergeTemp;
4376
                int costAMVPBest, costMergeBest, costMergeTemp;
4377
                bitsAMVPBest = MAX_INT;
4378
                costAMVPBest = MAX_INT;
4379
                costMergeBest = MAX_INT;
4380
                int mvpIdxBest = 0;
4381
                int mvpIdxTemp;
4382
                int mrgIdxBest = -1;
4383
                int mrgIdxTemp = -1;
4384
                int xCUStart = cu.m_cuPelX;
4385
                int yCUStart = cu.m_cuPelY;
4386
                int xStartInCU = 0, yStartInCU = 0;
4387
                if (ePartSize == SIZE_2Nx2N)
4388
                    xStartInCU = yStartInCU = 0;
4389
                else if (ePartSize == SIZE_2NxN)
4390
                {
4391
                    xStartInCU = 0;
4392
                    yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4393
                }
4394
                else if (ePartSize == SIZE_Nx2N)
4395
                {
4396
                    xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4397
                    yStartInCU = 0;
4398
                }
4399
                const pixel* currStart;
4400
                int currStride;
4401
                int refStride;
4402
                distAMVPBest = 0;
4403
                pixel* ref;
4404
4405
                cu.setPUMv(0, cMv, pu.puAbsPartIdx, partIdx);
4406
                cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, partIdx);
4407
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4408
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4409
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4410
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4411
4412
                for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4413
                {
4414
                    int tempHeight, tempWidth;
4415
                    if (ch == 0)
4416
                    {
4417
                        tempHeight = dummyHeight;
4418
                        tempWidth = dummyWidth;
4419
                        ref = tmpPredYuv.getLumaAddr(partAddr);
4420
                        refStride = tmpPredYuv.m_size;
4421
                        distAMVPBest += m_me.bufSAD(ref, refStride);
4422
                    }
4423
                    else
4424
                    {
4425
                        tempHeight = dummyHeight >> m_vChromaShift;
4426
                        tempWidth = dummyWidth >> m_hChromaShift;
4427
4428
                        currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4429
                        currStride = intraBCMixedMode.fencYuv->m_csize;
4430
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4431
                        refStride = tmpPredYuv.m_csize;
4432
                        distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4433
                    }
4434
                }
4435
4436
                MV check;
4437
                for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4438
                {
4439
                    m_me.setMVP(bvPred[mvpIdxTemp]);
4440
                    bitsAMVPTemp = m_me.bitcost(cMv >> 2, bvPred[mvpIdxTemp]);
4441
                    if (bitsAMVPTemp < bitsAMVPBest)
4442
                    {
4443
                        bitsAMVPBest = bitsAMVPTemp;
4444
                        mvpIdxBest = mvpIdxTemp;
4445
                    }
4446
                }
4447
4448
                bitsAMVPBest++; // for MVP Index bits
4449
                costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4450
4451
                MVField cMvFieldNeighboursIBC[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4452
                uint8_t uhInterDirNeighboursIBC[MRG_MAX_NUM_CANDS];
4453
                int numValidMergeCandIBC = 0;
4454
4455
                if (ePartSize != SIZE_2Nx2N)
4456
                {
4457
                    if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4458
                    {
4459
                        cu.setPartSizeSubParts(SIZE_2Nx2N);
4460
                        if (partIdx == 0)
4461
                        {
4462
                            numValidMergeCandIBC = cu.getInterMergeCandidates(0, 0, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4463
                        }
4464
                        cu.setPartSizeSubParts(ePartSize);
4465
                    }
4466
                    else
4467
                    {
4468
                        numValidMergeCandIBC = cu.getInterMergeCandidates(pu.puAbsPartIdx, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4469
                    }
4470
4471
                    cu.roundMergeCandidates(cMvFieldNeighboursIBC, numValidMergeCandIBC);
4472
                    restrictBipredMergeCand(&cu, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC, numValidMergeCandIBC);
4473
4474
                    for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCandIBC; mrgIdxTemp++)
4475
                    {
4476
                        if (uhInterDirNeighboursIBC[mrgIdxTemp] != 1)
4477
                        {
4478
                            continue;
4479
                        }
4480
                        if (m_slice->m_refPOCList[0][cMvFieldNeighboursIBC[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4481
                        {
4482
                            continue;
4483
                        }
4484
4485
                        if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, dummyWidth, dummyHeight, &cu,
4486
                            xStartInCU, yStartInCU, (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4487
                        {
4488
                            continue;
4489
                        }
4490
                        bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4491
4492
                        distMergeTemp = 0;
4493
                        cu.setPUMv(0, cMvFieldNeighboursIBC[mrgIdxTemp][0].mv, pu.puAbsPartIdx, partIdx);
4494
                        cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4495
                        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4496
                        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4497
                        cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4498
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4499
4500
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4501
                        {
4502
                            int tempHeight, tempWidth;
4503
                            if (ch == 0)
4504
                            {
4505
                                tempHeight = dummyHeight;
4506
                                tempWidth = dummyWidth;
4507
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4508
                                refStride = tmpPredYuv.m_size;
4509
                                distMergeTemp += m_me.bufSAD(ref, refStride);
4510
                            }
4511
                            else
4512
                            {
4513
                                tempHeight = dummyHeight >> m_vChromaShift;
4514
                                tempWidth = dummyWidth >> m_hChromaShift;
4515
4516
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4517
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4518
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4519
                                refStride = tmpPredYuv.m_csize;
4520
                                distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4521
                            }
4522
                        }
4523
                        costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4524
4525
                        if (costMergeTemp < costMergeBest)
4526
                        {
4527
                            costMergeBest = costMergeTemp;
4528
                            mrgIdxBest = mrgIdxTemp;
4529
                        }
4530
                    }
4531
                }
4532
4533
                if (costMergeBest < costAMVPBest)
4534
                {
4535
                    cost[combo] += costMergeBest;
4536
                    isIBCMergeMode[combo] = true;
4537
                    bestIBCMvpIdx[combo] = mrgIdxBest;
4538
4539
                    MVField mvField[2];
4540
                    MV mv(cMvFieldNeighboursIBC[mrgIdxBest][0].mv.x, cMvFieldNeighboursIBC[mrgIdxBest][0].mv.y);
4541
                    mvField[0].mv = mv;
4542
                    mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4543
                    mvField[1].mv = cMvZero;
4544
                    mvField[1].refIdx = REF_NOT_VALID;
4545
                    cMRGMvFieldIBC[combo][0] = mvField[0];
4546
                    cMRGMvFieldIBC[combo][1] = mvField[1];
4547
                }
4548
                else
4549
                {
4550
                    cost[combo] += costAMVPBest;
4551
                    isIBCMergeMode[combo] = false;
4552
                    bestIBCMvpIdx[combo] = mvpIdxBest;
4553
                    cMvPredCand[combo][partIdx].set(bvPred[mvpIdxBest].x << 2, bvPred[mvpIdxBest].y << 2);
4554
                }
4555
4556
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);  // list 0 prediction
4557
                if (isIBCMergeMode[combo])
4558
                {
4559
                    cu.setPUMv(0, cMRGMvFieldIBC[combo][0].mv, pu.puAbsPartIdx, partIdx);
4560
                }
4561
                else
4562
                {
4563
                    cu.setPUMv(0, iMvCandList[8 + partIdx], pu.puAbsPartIdx, partIdx);
4564
                    cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4565
                    cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4566
                }
4567
                // ibc merge
4568
                /////////////////////////////////////////////////////////////
4569
            }
4570
            else // is inter PU
4571
            {
4572
                uint32_t  costInterTemp = 0;
4573
                uint32_t  costInterBest = UINT32_MAX;
4574
                const pixel* currStart;
4575
                int currStride;
4576
                pixel* ref;
4577
                int refStride;
4578
                MergeData merge;
4579
                memset(&merge, 0, sizeof(merge));
4580
                for (int refList = 0; refList < numPredDir; refList++)
4581
                {
4582
                    uint32_t numRef = refList ? ((m_slice->m_numRefIdx[1] > 1) ? 2 : 1) : ((m_slice->m_numRefIdx[0] - 1 > 1) ? 2 : 1);
4583
                    for (uint32_t refIdx = 0; refIdx < numRef; refIdx++)
4584
                    {
4585
                        MV cMv = iMvCandList[4 * refList + 2 * refIdx + partIdx];
4586
4587
                        cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4588
                        cu.getPMV(intraBCMixedMode.interNeighbours, refList, refIdx, intraBCMixedMode.amvpCand[refList][refIdx], mvc, partIdx, pu.puAbsPartIdx);
4589
                        int mvpIdx;
4590
4591
                        uint32_t  tempCost0 = 0;
4592
                        uint32_t  tempCost1 = 0;
4593
                        mvPred[0] = intraBCMixedMode.amvpCand[refList][refIdx][0];
4594
                        mvPred[1] = intraBCMixedMode.amvpCand[refList][refIdx][1];
4595
4596
                        m_me.setMVP(mvPred[0]);
4597
                        tempCost0 = m_me.bitcost(cMv, mvPred[0]);
4598
                        m_me.setMVP(mvPred[1]);
4599
                        tempCost1 = m_me.bitcost(cMv, mvPred[1]);
4600
                        if (tempCost1 < tempCost0)
4601
                        {
4602
                            mvpIdx = 1;
4603
                        }
4604
                        else
4605
                        {
4606
                            mvpIdx = 0;
4607
                        }
4608
                        uint32_t bitsTemp = m_listSelBits[refList] + MVP_IDX_BITS;
4609
                        bitsTemp += getTUBits(refIdx, numRef);
4610
4611
                        m_me.setMVP(mvPred[mvpIdx]);
4612
                        if (cu.m_slice->m_useIntegerMv)
4613
                        {
4614
                            cu.setPUMv(refList, (cMv >> 2) << 2, pu.puAbsPartIdx, partIdx);
4615
                        }
4616
                        else
4617
                        {
4618
                            cu.setPUMv(refList, cMv, pu.puAbsPartIdx, partIdx);
4619
                        }
4620
                        cu.setPURefIdx(refList, refIdx, pu.puAbsPartIdx, partIdx);
4621
                        cu.setPUInterDir(1 + refList, pu.puAbsPartIdx, partIdx);
4622
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4623
4624
                        costInterTemp = 0;
4625
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4626
                        {
4627
                            int tempHeight, tempWidth;
4628
                            if (ch == 0)
4629
                            {
4630
                                tempHeight = dummyHeight;
4631
                                tempWidth = dummyWidth;
4632
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4633
                                refStride = tmpPredYuv.m_size;
4634
                                costInterTemp += m_me.bufSAD(ref, refStride);
4635
                            }
4636
                            else
4637
                            {
4638
                                tempHeight = dummyHeight >> m_vChromaShift;
4639
                                tempWidth = dummyWidth >> m_hChromaShift;
4640
4641
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4642
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4643
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4644
                                refStride = tmpPredYuv.m_csize;
4645
                                costInterTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4646
                            }
4647
4648
                            if (costInterTemp >= costInterBest)
4649
                            {
4650
                                break;
4651
                            }
4652
                        }
4653
                        cu.setPURefIdx(refList, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4654
4655
                        costInterTemp += m_me.bitcost(cMv, mvPred[mvpIdx]);
4656
                        costInterTemp += m_rdCost.getCost(bitsTemp);
4657
4658
                        if (costInterTemp < costInterBest)
4659
                        {
4660
                            costInterBest = costInterTemp;
4661
                            bestInterMvpIdx[combo] = mvpIdx;
4662
                            bestInterDir[combo] = refList;
4663
                            bestRefIdx[combo] = refIdx;
4664
                            cMvPredCand[combo][partIdx] = mvPred[mvpIdx];
4665
                        }
4666
                    }
4667
                } // end RefIdx and RefList search
4668
4669
                uint32_t MRGInterDir = 0;
4670
                uint32_t MRGIndex = 0;
4671
4672
                // find Merge result
4673
                uint32_t MRGCost = UINT32_MAX;
4674
                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4675
4676
                mergeEstimation(cu, cuGeom, pu, partIdx, merge);
4677
                MRGInterDir = merge.dir;
4678
                cMRGMvField[combo][0] = merge.mvField[0];
4679
                cMRGMvField[combo][1] = merge.mvField[1];
4680
                MRGIndex = merge.index;
4681
                cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4682
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4683
4684
                if (MRGCost < costInterBest)
4685
                {
4686
                    costInterBest = MRGCost;
4687
                    isMergeMode[combo] = true;
4688
                    bestInterMvpIdx[combo] = MRGIndex;
4689
                    bestInterDir[combo] = MRGInterDir;
4690
                }
4691
4692
                cost[combo] += costInterBest;
4693
                if (isMergeMode[combo])
4694
                {
4695
                    cu.setPUInterDir(bestInterDir[combo], pu.puAbsPartIdx, partIdx);
4696
                    cu.setPUMv(0, cMRGMvField[combo][0].mv, pu.puAbsPartIdx, partIdx);
4697
                    cu.setPURefIdx(0, cMRGMvField[combo][0].refIdx, pu.puAbsPartIdx, partIdx);
4698
                    cu.setPUMv(1, cMRGMvField[combo][1].mv, pu.puAbsPartIdx, partIdx);
4699
                    cu.setPURefIdx(1, cMRGMvField[combo][1].refIdx, pu.puAbsPartIdx, partIdx);
4700
                }
4701
                else
4702
                {
4703
                    int refListOpt = bestInterDir[combo];
4704
                    int refIdxOpt = bestRefIdx[combo];
4705
                    if (cu.m_slice->m_useIntegerMv)
4706
                    {
4707
                        cu.setPUMv(refListOpt, (iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, pu.puAbsPartIdx, partIdx);
4708
                    }
4709
                    else
4710
                    {
4711
                        cu.setPUMv(refListOpt, iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt], pu.puAbsPartIdx, partIdx);
4712
                    }
4713
                    cu.setPURefIdx(refListOpt, refIdxOpt, pu.puAbsPartIdx, partIdx);
4714
                    cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4715
                    cu.setPUInterDir(1 + refListOpt, pu.puAbsPartIdx, partIdx);
4716
                    cu.m_mvpIdx[refListOpt][pu.puAbsPartIdx] = bestInterMvpIdx[combo];
4717
                }
4718
            }
4719
        } // for ipartIdx
4720
    } // for combo
4721
4722
    if (IBCValidFlag > 1)
4723
    {
4724
        return false;
4725
    }
4726
4727
    MV cMvd;
4728
    MV cMVFinal;
4729
    if (cost[0] <= cost[1])
4730
    {
4731
        int iDummyWidth1, iDummyHeight1;
4732
        uint32_t partAddr = 0;
4733
        uint32_t partIdx = 0;
4734
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4735
4736
        if (isIBCMergeMode[0])
4737
        {
4738
            cu.m_mergeFlag[partAddr] = true;
4739
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4740
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4741
            cu.setPUMv(0, cMRGMvFieldIBC[0][0].mv, partAddr, partIdx);
4742
            cu.setPURefIdx(0, cMRGMvFieldIBC[0][0].refIdx, partAddr, partIdx);
4743
            cu.setPUMv(1, cMRGMvFieldIBC[0][1].mv, partAddr, partIdx);
4744
            cu.setPURefIdx(1, cMRGMvFieldIBC[0][1].refIdx, partAddr, partIdx);
4745
4746
            cu.m_mvd[0][partAddr] = cMvZero;
4747
            cu.m_mvd[1][partAddr] = cMvZero;
4748
        }
4749
        else
4750
        {
4751
            cu.m_mergeFlag[partAddr] = false;
4752
4753
            cMvd.set((iMvCandList[8].x - cMvPredCand[0][0].x) >> 2, (iMvCandList[8].y - cMvPredCand[0][0].y) >> 2);
4754
            cu.setPUMv(0, iMvCandList[8], partAddr, partIdx);
4755
            cu.m_mvd[0][partAddr] = cMvd;
4756
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4757
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4758
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4759
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4760
        }
4761
4762
        partIdx = 1;
4763
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4764
4765
        if (isMergeMode[0])
4766
        {
4767
            cu.m_mergeFlag[partAddr] = true;
4768
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[0];
4769
            cu.setPUInterDir(bestInterDir[0], partAddr, partIdx);  // list 0 prediction
4770
            cu.setPUMv(0, cMRGMvField[0][0].mv, partAddr, partIdx);
4771
            cu.setPURefIdx(0, cMRGMvField[0][0].refIdx, partAddr, partIdx);
4772
            cu.setPUMv(1, cMRGMvField[0][1].mv, partAddr, partIdx);
4773
            cu.setPURefIdx(1, cMRGMvField[0][1].refIdx, partAddr, partIdx);
4774
4775
            cu.m_mvd[0][partAddr] = cMvZero;
4776
            cu.m_mvd[1][partAddr] = cMvZero;
4777
        }
4778
        else
4779
        {
4780
            int refListOpt = bestInterDir[0];
4781
            int refIdxOpt = bestRefIdx[0];
4782
            if (cu.m_slice->m_useIntegerMv)
4783
            {
4784
                cMvd.set(((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[0][1].x >> 2)), ((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[0][1].y >> 2)));
4785
                cu.setPUMv(refListOpt, (iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4786
            }
4787
            else
4788
            {
4789
                cMvd.set(iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[0][1].x, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[0][1].y);
4790
                cu.setPUMv(refListOpt, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4791
            }
4792
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4793
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4794
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4795
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4796
            cu.m_mergeFlag[partAddr] = false;
4797
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[0];
4798
        }
4799
    }
4800
    else
4801
    {
4802
        int dummyWidth2, dummyHeight2;
4803
        uint32_t partAddr = 0;
4804
        uint32_t partIdx = 0;
4805
4806
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4807
4808
        if (isMergeMode[1])
4809
        {
4810
            cu.m_mergeFlag[partAddr] = true;
4811
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[1];
4812
            cu.setPUInterDir(bestInterDir[1], partAddr, partIdx);  // list 0 prediction
4813
            cu.setPUMv(0, cMRGMvField[1][0].mv, partAddr, partIdx);
4814
            cu.setPURefIdx(0, cMRGMvField[1][0].refIdx, partAddr, partIdx);
4815
            cu.setPUMv(1, cMRGMvField[1][1].mv, partAddr, partIdx);
4816
            cu.setPURefIdx(1, cMRGMvField[1][1].refIdx, partAddr, partIdx);
4817
4818
            cu.m_mvd[0][partAddr] = cMvZero;
4819
            cu.m_mvd[1][partAddr] = cMvZero;
4820
        }
4821
        else
4822
        {
4823
            int refListOpt = bestInterDir[1];
4824
            int refIdxOpt = bestRefIdx[1];
4825
            if (cu.m_slice->m_useIntegerMv)
4826
            {
4827
                cMvd.set((iMvCandList[2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[1][0].x >> 2), (iMvCandList[2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[1][0].y >> 2));
4828
                cu.setPUMv(refListOpt, (iMvCandList[2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4829
            }
4830
            else
4831
            {
4832
                cMvd.set(iMvCandList[2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[1][0].x, iMvCandList[2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[1][0].y);
4833
                cu.setPUMv(refListOpt, iMvCandList[2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4834
            }
4835
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4836
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4837
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4838
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4839
            cu.m_mergeFlag[partAddr] = false;
4840
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[1];
4841
        }
4842
4843
        partIdx = 1;
4844
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4845
4846
        if (isIBCMergeMode[1])
4847
        {
4848
            cu.m_mergeFlag[partAddr] = true;
4849
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4850
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4851
            cu.setPUMv(0, cMRGMvFieldIBC[1][0].mv, partAddr, partIdx);
4852
            cu.setPURefIdx(0, cMRGMvFieldIBC[1][0].refIdx, partAddr, partIdx);
4853
            cu.setPUMv(1, cMRGMvFieldIBC[1][1].mv, partAddr, partIdx);
4854
            cu.setPURefIdx(1, cMRGMvFieldIBC[1][1].refIdx, partAddr, partIdx);
4855
4856
            cu.m_mvd[0][partAddr] = cMvZero;
4857
            cu.m_mvd[1][partAddr] = cMvZero;
4858
        }
4859
        else
4860
        {
4861
            cu.m_mergeFlag[partAddr] = false;
4862
4863
            cMvd.set(((iMvCandList[9].x - cMvPredCand[1][1].x) >> 2), (iMvCandList[9].y - cMvPredCand[1][1].y) >> 2);
4864
            cu.setPUMv(0, iMvCandList[9], partAddr, partIdx);
4865
            cu.m_mvd[0][partAddr] = cMvd;
4866
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4867
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4868
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4869
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4870
        }
4871
    }
4872
    for (int partIdx = 0; partIdx < numPart; ++partIdx)
4873
    {
4874
        PredictionUnit pu(cu, cuGeom, partIdx);
4875
        motionCompensation(cu, pu, *predYuv, 1, 1);
4876
    }
4877
4878
    return true;
4879
}
4880
#endif
4881
4882
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
4883
0
{
4884
0
    if (cuMode == SIZE_2Nx2N)
4885
0
    {
4886
0
        blockBit[0] = (!bPSlice) ? 3 : 1;
4887
0
        blockBit[1] = 3;
4888
0
        blockBit[2] = 5;
4889
0
    }
4890
0
    else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
4891
0
    {
4892
0
        static const uint32_t listBits[2][3][3] =
4893
0
        {
4894
0
            { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
4895
0
            { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
4896
0
        };
4897
0
        if (bPSlice)
4898
0
        {
4899
0
            blockBit[0] = 3;
4900
0
            blockBit[1] = 0;
4901
0
            blockBit[2] = 0;
4902
0
        }
4903
0
        else
4904
0
            memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
4905
0
    }
4906
0
    else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
4907
0
    {
4908
0
        static const uint32_t listBits[2][3][3] =
4909
0
        {
4910
0
            { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
4911
0
            { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
4912
0
        };
4913
0
        if (bPSlice)
4914
0
        {
4915
0
            blockBit[0] = 3;
4916
0
            blockBit[1] = 0;
4917
0
            blockBit[2] = 0;
4918
0
        }
4919
0
        else
4920
0
            memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
4921
0
    }
4922
0
    else if (cuMode == SIZE_NxN)
4923
0
    {
4924
0
        blockBit[0] = (!bPSlice) ? 3 : 1;
4925
0
        blockBit[1] = 3;
4926
0
        blockBit[2] = 5;
4927
0
    }
4928
0
    else
4929
0
    {
4930
0
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
4931
0
    }
4932
0
}
4933
4934
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
4935
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
4936
0
{
4937
0
    int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);
4938
0
    if (diffBits < 0)
4939
0
    {
4940
0
        mvpIdx = !mvpIdx;
4941
0
        uint32_t origOutBits = outBits;
4942
0
        outBits = origOutBits + diffBits;
4943
0
        outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4944
0
    }
4945
0
    return amvpCand[mvpIdx];
4946
0
}
4947
4948
/* Update to default MVP when using an alternative mvp */
4949
void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
4950
0
{
4951
0
    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
4952
0
    uint32_t origOutBits = outBits;
4953
0
    outBits = origOutBits + diffBits;
4954
0
    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4955
0
}
4956
4957
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
4958
0
{
4959
0
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
4960
0
    mvmin = mvp - dist;
4961
0
    mvmax = mvp + dist;
4962
4963
0
    if (m_vertRestriction)
4964
0
    {
4965
0
        int mvRestricted = (56 - 1) << 2; // -1 to consider subpel search
4966
0
        if (mvmax.y >= mvRestricted)
4967
0
        {
4968
0
            mvmax.y = mvRestricted; //only positive side is restricted
4969
0
        }
4970
0
    }
4971
4972
0
    cu.clipMv(mvmin);
4973
0
    cu.clipMv(mvmax);
4974
4975
0
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
4976
0
          cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
4977
0
          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
4978
0
    {
4979
0
        int safeX, maxSafeMv;
4980
0
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
4981
0
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
4982
0
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
4983
0
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
4984
0
    }
4985
4986
    // apply restrict on slices
4987
0
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
4988
0
    {
4989
0
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
4990
0
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
4991
0
    }
4992
4993
    /* Clip search range to signaled maximum MV length.
4994
     * We do not support this VUI field being changed from the default */
4995
0
    const int maxMvLen = (1 << 15) - 1;
4996
0
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
4997
0
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
4998
0
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
4999
0
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);
5000
5001
0
    mvmin >>= 2;
5002
0
    mvmax >>= 2;
5003
5004
    /* conditional clipping for frame parallelism */
5005
0
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
5006
0
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);
5007
5008
    /* conditional clipping for negative mv range */
5009
0
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
5010
0
}
5011
5012
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5013
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
5014
0
{
5015
0
    CUData& cu = interMode.cu;
5016
0
    Yuv* reconYuv = &interMode.reconYuv;
5017
0
    const Yuv* fencYuv = interMode.fencYuv;
5018
0
    Yuv* predYuv = &interMode.predYuv;
5019
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5020
0
    uint32_t depth  = cu.m_cuDepth[0];
5021
5022
    // No residual coding : SKIP mode
5023
5024
0
    cu.setPredModeSubParts(MODE_SKIP);
5025
0
    cu.clearCbf();
5026
0
    cu.setTUDepthSubParts(0, 0, depth);
5027
5028
0
    reconYuv->copyFromYuv(interMode.predYuv);
5029
5030
    // Luma
5031
0
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
5032
0
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5033
0
    interMode.distortion = interMode.lumaDistortion;
5034
    // Chroma
5035
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5036
0
    {
5037
0
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5038
0
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5039
0
        interMode.distortion += interMode.chromaDistortion;
5040
0
    }
5041
0
    cu.m_distortion[0] = interMode.distortion;
5042
0
    m_entropyCoder.load(m_rqt[depth].cur);
5043
0
    m_entropyCoder.resetBits();
5044
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5045
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
5046
0
    m_entropyCoder.codeSkipFlag(cu, 0);
5047
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5048
0
    m_entropyCoder.codeMergeIndex(cu, 0);
5049
0
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5050
0
    interMode.coeffBits = 0;
5051
0
    interMode.totalBits = interMode.mvBits + skipFlagBits;
5052
0
    if (m_rdCost.m_psyRd)
5053
0
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5054
0
    else if(m_rdCost.m_ssimRd)
5055
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5056
5057
0
    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5058
0
    updateModeCost(interMode);
5059
0
    m_entropyCoder.store(interMode.contexts);
5060
0
}
5061
5062
/* encode residual and calculate rate-distortion for a CU block.
5063
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5064
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
5065
0
{
5066
0
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
5067
5068
0
    CUData& cu = interMode.cu;
5069
0
    Yuv* reconYuv = &interMode.reconYuv;
5070
0
    Yuv* predYuv = &interMode.predYuv;
5071
0
    uint32_t depth = cuGeom.depth;
5072
0
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
5073
0
    const Yuv* fencYuv = interMode.fencYuv;
5074
5075
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5076
5077
0
    uint32_t log2CUSize = cuGeom.log2CUSize;
5078
0
    int sizeIdx = log2CUSize - 2;
5079
5080
0
    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5081
5082
0
    uint32_t tuDepthRange[2];
5083
0
    cu.getInterTUQtDepthRange(tuDepthRange, 0);
5084
5085
0
    m_entropyCoder.load(m_rqt[depth].cur);
5086
5087
0
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
5088
0
        m_maxTUDepth = -1;
5089
0
    else if (m_limitTU & X265_TU_LIMIT_BFS)
5090
0
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
5091
5092
0
    Cost costs;
5093
0
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
5094
0
    {
5095
        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
5096
0
        int32_t tempDepth = m_maxTUDepth;
5097
0
        if (m_maxTUDepth != -1)
5098
0
        {
5099
0
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
5100
0
            uint32_t minSize = tuDepthRange[0];
5101
0
            uint32_t maxSize = tuDepthRange[1];
5102
0
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
5103
0
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
5104
0
        }
5105
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5106
0
        m_maxTUDepth = tempDepth;
5107
0
    }
5108
0
    else
5109
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5110
5111
0
    uint32_t tqBypass = cu.m_tqBypass[0];
5112
0
    if (!tqBypass)
5113
0
    {
5114
0
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5115
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5116
0
        {
5117
0
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
5118
0
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
5119
0
        }
5120
5121
        /* Consider the RD cost of not signaling any residual */
5122
0
        m_entropyCoder.load(m_rqt[depth].cur);
5123
0
        m_entropyCoder.resetBits();
5124
0
        m_entropyCoder.codeQtRootCbfZero();
5125
0
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
5126
5127
0
        uint32_t cbf0Energy; uint64_t cbf0Cost;
5128
0
        if (m_rdCost.m_psyRd)
5129
0
        {
5130
0
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5131
0
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5132
0
        }
5133
0
        else if(m_rdCost.m_ssimRd)
5134
0
        {
5135
0
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
5136
0
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5137
0
        }
5138
0
        else
5139
0
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
5140
5141
0
        if (cbf0Cost < costs.rdcost)
5142
0
        {
5143
0
            cu.clearCbf();
5144
0
            cu.setTUDepthSubParts(0, 0, depth);
5145
0
        }
5146
0
    }
5147
5148
0
    if (cu.getQtRootCbf(0))
5149
0
        saveResidualQTData(cu, *resiYuv, 0, 0);
5150
5151
    /* calculate signal bits for inter/merge/skip coded CU */
5152
0
    m_entropyCoder.load(m_rqt[depth].cur);
5153
5154
0
    m_entropyCoder.resetBits();
5155
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5156
0
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
5157
5158
0
    uint32_t coeffBits, bits, mvBits;
5159
0
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
5160
0
    {
5161
0
        cu.setPredModeSubParts(MODE_SKIP);
5162
5163
        /* Merge/Skip */
5164
0
        coeffBits = mvBits = 0;
5165
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5166
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5167
0
        m_entropyCoder.codeMergeIndex(cu, 0);
5168
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5169
0
        bits = mvBits + skipFlagBits;
5170
0
    }
5171
0
    else
5172
0
    {
5173
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5174
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5175
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
5176
0
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
5177
0
        m_entropyCoder.codePredInfo(cu, 0);
5178
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5179
5180
0
        bool bCodeDQP = m_slice->m_pps->bUseDQP;
5181
0
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
5182
0
        bits = m_entropyCoder.getNumberOfWrittenBits();
5183
5184
0
        coeffBits = bits - mvBits - skipFlagBits;
5185
0
    }
5186
5187
0
    m_entropyCoder.store(interMode.contexts);
5188
5189
0
    if (cu.getQtRootCbf(0))
5190
0
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5191
0
    else
5192
0
        reconYuv->copyFromYuv(*predYuv);
5193
5194
    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
5195
0
    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5196
0
    interMode.distortion = bestLumaDist;
5197
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5198
0
    {
5199
0
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5200
0
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5201
0
        interMode.chromaDistortion = bestChromaDist;
5202
0
        interMode.distortion += bestChromaDist;
5203
0
    }
5204
0
    if (m_rdCost.m_psyRd)
5205
0
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5206
0
    else if(m_rdCost.m_ssimRd)
5207
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5208
5209
0
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5210
0
    interMode.totalBits = bits;
5211
0
    interMode.lumaDistortion = bestLumaDist;
5212
0
    interMode.coeffBits = coeffBits;
5213
0
    interMode.mvBits = mvBits;
5214
0
    cu.m_distortion[0] = interMode.distortion;
5215
0
    updateModeCost(interMode);
5216
0
    checkDQP(interMode, cuGeom);
5217
5218
#if ENABLE_SCC_EXT
5219
    if (m_param->bEnableSCC)
5220
        interMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
5221
#endif
5222
0
}
5223
5224
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
5225
0
{
5226
0
    uint32_t depth = cuGeom.depth + tuDepth;
5227
0
    CUData& cu = mode.cu;
5228
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5229
5230
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5231
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
5232
0
        bCheckFull = false;
5233
5234
0
    if (bCheckFull)
5235
0
    {
5236
        // code full block
5237
0
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5238
0
        uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5239
5240
0
        uint32_t tuDepthC = tuDepth;
5241
0
        if (log2TrSizeC < 2)
5242
0
        {
5243
0
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5244
0
            log2TrSizeC = 2;
5245
0
            tuDepthC--;
5246
0
            codeChroma &= !(absPartIdx & 3);
5247
0
        }
5248
5249
0
        uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5250
0
        uint32_t setCbf = 1 << tuDepth;
5251
5252
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5253
0
        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
5254
5255
0
        uint32_t sizeIdx  = log2TrSize  - 2;
5256
5257
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5258
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5259
5260
0
        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
5261
0
        const Yuv* fencYuv = mode.fencYuv;
5262
5263
0
        int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
5264
0
        uint32_t strideResiY = resiYuv.m_size;
5265
5266
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5267
0
        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5268
5269
0
        if (numSigY)
5270
0
        {
5271
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
5272
0
            cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
5273
0
        }
5274
0
        else
5275
0
        {
5276
0
            primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5277
0
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
5278
0
        }
5279
5280
0
        if (codeChroma)
5281
0
        {
5282
0
            uint32_t sizeIdxC = log2TrSizeC - 2;
5283
0
            uint32_t strideResiC = resiYuv.m_csize;
5284
5285
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5286
0
            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
5287
0
            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
5288
0
            bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5289
5290
0
            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5291
0
            do
5292
0
            {
5293
0
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5294
0
                uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5295
5296
0
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5297
0
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5298
5299
0
                int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
5300
0
                const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
5301
0
                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
5302
0
                if (numSigU)
5303
0
                {
5304
0
                    m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
5305
0
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5306
0
                }
5307
0
                else
5308
0
                {
5309
0
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0);
5310
0
                    cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
5311
0
                }
5312
5313
0
                int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
5314
0
                const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
5315
0
                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
5316
0
                if (numSigV)
5317
0
                {
5318
0
                    m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
5319
0
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5320
0
                }
5321
0
                else
5322
0
                {
5323
0
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0);
5324
0
                    cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
5325
0
                }
5326
0
            }
5327
0
            while (tuIterator.isNextSection());
5328
5329
0
            if (splitIntoSubTUs)
5330
0
            {
5331
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5332
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5333
0
            }
5334
0
        }
5335
0
    }
5336
0
    else
5337
0
    {
5338
0
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
5339
5340
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
5341
0
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
5342
0
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
5343
0
        {
5344
0
            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
5345
0
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
5346
0
            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5347
0
            {
5348
0
                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
5349
0
                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
5350
0
            }
5351
0
        }
5352
0
        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
5353
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5354
0
        {
5355
0
            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
5356
0
            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
5357
0
        }
5358
0
    }
5359
0
}
5360
5361
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
5362
0
{
5363
0
    uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
5364
5365
0
    if (m_rdCost.m_psyRd)
5366
0
        return m_rdCost.calcPsyRdCost(dist, nullBits, energy);
5367
0
    else if(m_rdCost.m_ssimRd)
5368
0
        return m_rdCost.calcSsimRdCost(dist, nullBits, energy);
5369
0
    else
5370
0
        return m_rdCost.calcRdCost(dist, nullBits);
5371
0
}
5372
5373
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
5374
0
{
5375
0
    CUData& cu = mode.cu;
5376
0
    uint32_t depth = cuGeom.depth + tuDepth;
5377
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5378
5379
0
    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
5380
0
    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
5381
0
    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
5382
0
    {
5383
0
        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
5384
0
        {
5385
0
            m_maxTUDepth = cu.m_tuDepth[0];
5386
            // Fetch maximum TU depth of first sub partition to limit recursion of others
5387
0
            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
5388
0
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
5389
0
        }
5390
0
        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
5391
0
        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
5392
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5393
0
        {
5394
0
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
5395
0
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
5396
0
        }
5397
0
    }
5398
0
    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
5399
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5400
0
    {
5401
0
        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
5402
0
        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
5403
0
    }
5404
5405
    // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
5406
    // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
5407
    // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
5408
    // at depth 0 (for example).
5409
0
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
5410
0
    m_entropyCoder.resetBits();
5411
0
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
5412
0
    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
5413
0
    splitCost.bits += splitCbfBits;
5414
5415
0
    if (m_rdCost.m_psyRd)
5416
0
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
5417
0
    else if(m_rdCost.m_ssimRd)
5418
0
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
5419
0
    else
5420
0
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
5421
        
5422
0
    return ycbf || ucbf || vcbf;
5423
0
}
5424
5425
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
5426
0
{
5427
0
    CUData& cu = mode.cu;
5428
0
    uint32_t depth = cuGeom.depth + tuDepth;
5429
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5430
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
5431
5432
0
    bool bCheckSplit = log2TrSize > depthRange[0];
5433
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5434
0
    bool bSaveTUData = false, bLoadTUData = false;
5435
0
    uint32_t idx = 0;
5436
5437
0
    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
5438
0
    {
5439
0
        if (bCheckSplit && bCheckFull && tuDepth)
5440
0
        {
5441
0
            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
5442
0
            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
5443
0
            idx = (depth - 1) * 4 + qIdx;
5444
0
            if (splitMore)
5445
0
            {
5446
0
                bLoadTUData = true;
5447
0
                bCheckFull = false;
5448
0
            }
5449
0
            else
5450
0
            {
5451
0
                bSaveTUData = true;
5452
0
                bCheckSplit = false;
5453
0
            }
5454
0
        }
5455
0
    }
5456
0
    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
5457
0
    {
5458
0
        if (bCheckSplit && m_maxTUDepth >= 0)
5459
0
        {
5460
0
            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
5461
0
            bCheckSplit = log2TrSize > log2MaxTrSize;
5462
0
        }
5463
0
    }
5464
5465
0
    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
5466
5467
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
5468
0
        bCheckFull = false;
5469
5470
0
    X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
5471
5472
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5473
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5474
0
    uint32_t tuDepthC = tuDepth;
5475
0
    if (log2TrSizeC < 2)
5476
0
    {
5477
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5478
0
        log2TrSizeC = 2;
5479
0
        tuDepthC--;
5480
0
        codeChroma &= !(absPartIdx & 3);
5481
0
    }
5482
5483
    // code full block
5484
0
    Cost fullCost;
5485
0
    fullCost.rdcost = MAX_INT64;
5486
5487
0
    uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5488
0
    uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5489
0
    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5490
0
    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5491
0
    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5492
0
    uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5493
0
    uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
5494
5495
0
    m_entropyCoder.store(m_rqt[depth].rqtRoot);
5496
5497
0
    uint32_t trSize = 1 << log2TrSize;
5498
0
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5499
0
    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5500
0
    const Yuv* fencYuv = mode.fencYuv;
5501
5502
    // code full block
5503
0
    if (bCheckFull)
5504
0
    {
5505
0
        uint32_t trSizeC = 1 << log2TrSizeC;
5506
0
        int partSize = partitionFromLog2Size(log2TrSize);
5507
0
        int partSizeC = partitionFromLog2Size(log2TrSizeC);
5508
0
        const uint32_t qtLayer = log2TrSize - 2;
5509
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5510
0
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
5511
5512
0
        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
5513
0
        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
5514
0
        bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
5515
5516
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5517
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5518
5519
0
        if (bEnableRDOQ)
5520
0
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5521
5522
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5523
0
        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
5524
0
        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5525
0
        cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
5526
5527
0
        m_entropyCoder.resetBits();
5528
5529
0
        if (bSplitPresentFlag && log2TrSize > depthRange[0])
5530
0
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
5531
5532
0
        if (cbfFlag[TEXT_LUMA][0])
5533
0
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
5534
0
        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
5535
5536
0
        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
5537
5538
        //Assuming zero residual 
5539
0
        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5540
0
        uint32_t zeroEnergyY = 0;
5541
0
        if (m_rdCost.m_psyRd)
5542
0
            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5543
0
        else if(m_rdCost.m_ssimRd)
5544
0
            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
5545
5546
0
        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
5547
0
        uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
5548
5549
0
        if (cbfFlag[TEXT_LUMA][0])
5550
0
        {
5551
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
5552
5553
            // non-zero cost calculation for luma - This is an approximation
5554
            // finally we have to encode correct cbf after comparing with null cost
5555
0
            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
5556
0
            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
5557
0
            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
5558
0
            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5559
0
            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
5560
0
            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
5561
0
            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
5562
5563
0
            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
5564
0
            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
5565
0
            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
5566
0
            if (m_rdCost.m_psyRd)
5567
0
            {
5568
0
                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
5569
0
                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5570
0
            }
5571
0
            else if(m_rdCost.m_ssimRd)
5572
0
            {
5573
0
                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
5574
0
                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5575
0
            }
5576
0
            else
5577
0
                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
5578
5579
0
            if (cu.m_tqBypass[0])
5580
0
            {
5581
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5582
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5583
0
            }
5584
0
            else
5585
0
            {
5586
                // zero-cost calculation for luma. This is an approximation
5587
                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
5588
                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
5589
0
                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5590
5591
0
                if (nullCostY < singleCostY)
5592
0
                {
5593
0
                    cbfFlag[TEXT_LUMA][0] = 0;
5594
0
                    singleBits[TEXT_LUMA][0] = 0;
5595
0
                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5596
#if CHECKED_BUILD || _DEBUG
5597
                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
5598
                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
5599
#endif
5600
0
                    if (checkTransformSkipY)
5601
0
                        minCost[TEXT_LUMA][0] = nullCostY;
5602
0
                    singleDist[TEXT_LUMA][0] = zeroDistY;
5603
0
                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5604
0
                }
5605
0
                else
5606
0
                {
5607
0
                    if (checkTransformSkipY)
5608
0
                        minCost[TEXT_LUMA][0] = singleCostY;
5609
0
                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
5610
0
                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5611
0
                }
5612
0
            }
5613
0
        }
5614
0
        else
5615
0
        {
5616
0
            if (checkTransformSkipY)
5617
0
                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5618
0
            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5619
0
            singleDist[TEXT_LUMA][0] = zeroDistY;
5620
0
            singleBits[TEXT_LUMA][0] = 0;
5621
0
            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5622
0
        }
5623
5624
0
        cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5625
5626
0
        if (codeChroma)
5627
0
        {
5628
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5629
0
            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
5630
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5631
0
            {
5632
0
                sse_t zeroDistC = 0;
5633
0
                uint32_t zeroEnergyC = 0;
5634
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5635
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5636
5637
0
                do
5638
0
                {
5639
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5640
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5641
5642
0
                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5643
5644
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5645
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5646
5647
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5648
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5649
0
                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
5650
0
                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
5651
5652
0
                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
5653
0
                    if (cbfFlag[chromaId][tuIterator.section])
5654
0
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5655
5656
0
                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
5657
5658
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5659
0
                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
5660
5661
                    // Assuming zero residual 
5662
0
                    if (m_rdCost.m_psyRd)
5663
0
                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
5664
0
                    else if(m_rdCost.m_ssimRd)
5665
0
                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5666
5667
0
                    if (cbfFlag[chromaId][tuIterator.section])
5668
0
                    {
5669
0
                        m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
5670
0
                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
5671
5672
                        // non-zero cost calculation for luma, same as luma - This is an approximation
5673
                        // finally we have to encode correct cbf after comparing with null cost
5674
0
                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
5675
0
                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
5676
0
                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5677
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5678
0
                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5679
0
                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
5680
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
5681
0
                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
5682
0
                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
5683
0
                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
5684
0
                        if (m_rdCost.m_psyRd)
5685
0
                        {
5686
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
5687
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5688
0
                        }
5689
0
                        else if(m_rdCost.m_ssimRd)
5690
0
                        {
5691
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5692
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5693
0
                        }
5694
0
                        else
5695
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
5696
5697
0
                        if (cu.m_tqBypass[0])
5698
0
                        {
5699
0
                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5700
0
                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5701
0
                        }
5702
0
                        else
5703
0
                        {
5704
                            //zero-cost calculation for chroma. This is an approximation
5705
0
                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
5706
5707
0
                            if (nullCostC < singleCostC)
5708
0
                            {
5709
0
                                cbfFlag[chromaId][tuIterator.section] = 0;
5710
0
                                singleBits[chromaId][tuIterator.section] = 0;
5711
0
                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5712
#if CHECKED_BUILD || _DEBUG
5713
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5714
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
5715
#endif
5716
0
                                if (checkTransformSkipC)
5717
0
                                    minCost[chromaId][tuIterator.section] = nullCostC;
5718
0
                                singleDist[chromaId][tuIterator.section] = zeroDistC;
5719
0
                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5720
0
                            }
5721
0
                            else
5722
0
                            {
5723
0
                                if (checkTransformSkipC)
5724
0
                                    minCost[chromaId][tuIterator.section] = singleCostC;
5725
0
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5726
0
                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5727
0
                            }
5728
0
                        }
5729
0
                    }
5730
0
                    else
5731
0
                    {
5732
0
                        if (checkTransformSkipC)
5733
0
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
5734
0
                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5735
0
                        singleBits[chromaId][tuIterator.section] = 0;
5736
0
                        singleDist[chromaId][tuIterator.section] = zeroDistC;
5737
0
                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5738
0
                    }
5739
5740
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5741
0
                }
5742
0
                while (tuIterator.isNextSection());
5743
0
            }
5744
0
        }
5745
5746
0
        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
5747
0
        {
5748
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5749
0
            {
5750
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5751
0
                do
5752
0
                {
5753
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5754
0
                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5755
0
                }
5756
0
                while(tuIterator.isNextSection());
5757
0
            }
5758
0
        }
5759
0
        if (checkTransformSkipY)
5760
0
        {
5761
0
            sse_t nonZeroDistY = 0;
5762
0
            uint32_t nonZeroEnergyY = 0;
5763
0
            uint64_t singleCostY = MAX_INT64;
5764
5765
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5766
5767
0
            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
5768
5769
0
            if (bEnableRDOQ)
5770
0
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5771
5772
0
            fenc = fencYuv->getLumaAddr(absPartIdx);
5773
0
            resi = resiYuv.getLumaAddr(absPartIdx);
5774
0
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
5775
5776
0
            if (numSigTSkipY)
5777
0
            {
5778
0
                m_entropyCoder.resetBits();
5779
0
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
5780
0
                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
5781
0
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
5782
5783
0
                m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
5784
0
                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5785
5786
0
                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
5787
0
                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
5788
0
                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
5789
5790
0
                if (m_rdCost.m_psyRd)
5791
0
                {
5792
0
                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
5793
0
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5794
0
                }
5795
0
                else if(m_rdCost.m_ssimRd)
5796
0
                {
5797
0
                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
5798
0
                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5799
0
                }
5800
0
                else
5801
0
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
5802
0
            }
5803
5804
0
            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
5805
0
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5806
0
            else
5807
0
            {
5808
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5809
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5810
0
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
5811
0
                bestTransformMode[TEXT_LUMA][0] = 1;
5812
0
                if (m_param->limitTU)
5813
0
                    numSig[TEXT_LUMA][0] = numSigTSkipY;
5814
0
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
5815
0
                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
5816
0
                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
5817
0
            }
5818
5819
0
            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5820
0
        }
5821
5822
0
        if (codeChroma && checkTransformSkipC)
5823
0
        {
5824
0
            sse_t nonZeroDistC = 0;
5825
0
            uint32_t nonZeroEnergyC = 0;
5826
0
            uint64_t singleCostC = MAX_INT64;
5827
0
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
5828
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5829
5830
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5831
5832
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5833
0
            {
5834
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5835
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5836
5837
0
                do
5838
0
                {
5839
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5840
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5841
5842
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5843
5844
0
                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5845
5846
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5847
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5848
5849
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5850
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5851
0
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
5852
5853
0
                    m_entropyCoder.resetBits();
5854
0
                    singleBits[chromaId][tuIterator.section] = 0;
5855
5856
0
                    if (numSigTSkipC)
5857
0
                    {
5858
0
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
5859
0
                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5860
0
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
5861
5862
0
                        m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
5863
0
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
5864
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5865
0
                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
5866
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
5867
0
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
5868
0
                        if (m_rdCost.m_psyRd)
5869
0
                        {
5870
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
5871
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5872
0
                        }
5873
0
                        else if(m_rdCost.m_ssimRd)
5874
0
                        {
5875
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5876
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5877
0
                        }
5878
0
                        else
5879
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
5880
0
                    }
5881
5882
0
                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
5883
0
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5884
0
                    else
5885
0
                    {
5886
0
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5887
0
                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5888
0
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
5889
0
                        bestTransformMode[chromaId][tuIterator.section] = 1;
5890
0
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5891
0
                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
5892
0
                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
5893
0
                    }
5894
5895
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5896
0
                }
5897
0
                while (tuIterator.isNextSection());
5898
0
            }
5899
0
        }
5900
5901
        // Here we were encoding cbfs and coefficients, after calculating distortion above.
5902
        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
5903
        // bits required for coefficients and added with number of cbf bits. As I tested the order does not
5904
        // make any difference. But bit confused whether I should load the original context as below.
5905
0
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
5906
0
        m_entropyCoder.resetBits();
5907
5908
        //Encode cbf flags
5909
0
        if (codeChroma)
5910
0
        {
5911
0
            if (!splitIntoSubTUs)
5912
0
            {
5913
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5914
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5915
0
            }
5916
0
            else
5917
0
            {
5918
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5919
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5920
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5921
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
5922
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5923
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
5924
0
            }
5925
0
        }
5926
5927
0
        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
5928
5929
0
        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
5930
5931
0
        uint32_t coeffBits = 0;
5932
0
        coeffBits = singleBits[TEXT_LUMA][0];
5933
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5934
0
        {
5935
0
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
5936
0
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
5937
0
        }
5938
5939
        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
5940
        // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
5941
        // four split block's individual cbf value. This is not known before analysis of four split blocks.
5942
        // For that reason, I am collecting individual coefficient bits only.
5943
0
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
5944
5945
0
        fullCost.distortion += singleDist[TEXT_LUMA][0];
5946
0
        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
5947
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5948
0
        {
5949
0
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
5950
0
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
5951
0
        }
5952
5953
0
        if (m_rdCost.m_psyRd)
5954
0
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5955
0
        else if(m_rdCost.m_ssimRd)
5956
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5957
0
        else
5958
0
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
5959
5960
0
        if (m_param->limitTU && bCheckSplit)
5961
0
        {
5962
            // Stop recursion if the TU's energy level is minimal
5963
0
            uint32_t numCoeff = trSize * trSize;
5964
0
            if (cbfFlag[TEXT_LUMA][0] == 0)
5965
0
                bCheckSplit = false;
5966
0
            else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
5967
0
            {
5968
0
                uint32_t energy = 0;
5969
0
                for (uint32_t i = 0; i < numCoeff; i++)
5970
0
                    energy += abs(coeffCurY[i]);
5971
0
                if (energy == numSig[TEXT_LUMA][0])
5972
0
                    bCheckSplit = false;
5973
0
            }
5974
0
        }
5975
5976
0
        if (bSaveTUData)
5977
0
        {
5978
0
            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5979
0
            {
5980
0
                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5981
0
                {
5982
0
                    m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
5983
0
                    m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
5984
0
                }
5985
0
            }
5986
0
            m_cacheTU.cost[idx] = fullCost;
5987
0
            m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
5988
0
        }
5989
0
    }
5990
0
    if (bLoadTUData)
5991
0
    {
5992
0
        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5993
0
        {
5994
0
            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5995
0
            {
5996
0
                bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
5997
0
                cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
5998
0
            }
5999
0
        }
6000
0
        fullCost = m_cacheTU.cost[idx];
6001
0
        m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
6002
0
        bCheckFull = true;
6003
0
    }
6004
6005
    // code sub-blocks
6006
0
    if (bCheckSplit)
6007
0
    {
6008
0
        if (bCheckFull)
6009
0
        {
6010
0
            m_entropyCoder.store(m_rqt[depth].rqtTest);
6011
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
6012
0
        }
6013
6014
0
        Cost splitCost;
6015
0
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6016
0
        {
6017
            // Subdiv flag can be encoded at the start of analysis of split blocks.
6018
0
            m_entropyCoder.resetBits();
6019
0
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6020
0
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6021
0
        }
6022
6023
0
        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
6024
0
        if (yCbCrCbf || !bCheckFull)
6025
0
        {
6026
0
            if (splitCost.rdcost < fullCost.rdcost)
6027
0
            {
6028
0
                if (m_limitTU & X265_TU_LIMIT_BFS)
6029
0
                {
6030
0
                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
6031
0
                    bool nextSplit = nextlog2TrSize > depthRange[0];
6032
0
                    if (nextSplit)
6033
0
                    {
6034
0
                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
6035
0
                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
6036
0
                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6037
0
                        {
6038
                            // Subdiv flag can be encoded at the start of analysis of split blocks.
6039
0
                            m_entropyCoder.resetBits();
6040
0
                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6041
0
                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6042
0
                        }
6043
0
                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
6044
0
                    }
6045
0
                }
6046
0
                outCosts.distortion += splitCost.distortion;
6047
0
                outCosts.rdcost     += splitCost.rdcost;
6048
0
                outCosts.bits       += splitCost.bits;
6049
0
                outCosts.energy     += splitCost.energy;
6050
0
                return;
6051
0
            }
6052
0
            else
6053
0
                outCosts.energy     += splitCost.energy;
6054
0
        }
6055
6056
0
        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
6057
0
        if (codeChroma)
6058
0
        {
6059
0
            if (!splitIntoSubTUs)
6060
0
            {
6061
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
6062
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
6063
0
            }
6064
0
            else
6065
0
            {
6066
0
                uint32_t tuNumParts = absPartIdxStep >> 1;
6067
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6068
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6069
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6070
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6071
0
            }
6072
0
        }
6073
0
        X265_CHECK(bCheckFull, "check-full must be set\n");
6074
0
        m_entropyCoder.load(m_rqt[depth].rqtTest);
6075
0
    }
6076
6077
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
6078
0
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
6079
6080
0
    if (codeChroma)
6081
0
    {
6082
0
        if (!splitIntoSubTUs)
6083
0
        {
6084
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
6085
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
6086
0
        }
6087
0
        else
6088
0
        {
6089
0
            uint32_t tuNumParts = absPartIdxStep >> 1;
6090
6091
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
6092
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
6093
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6094
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6095
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6096
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6097
0
        }
6098
0
    }
6099
6100
0
    outCosts.distortion += fullCost.distortion;
6101
0
    outCosts.rdcost     += fullCost.rdcost;
6102
0
    outCosts.bits       += fullCost.bits;
6103
0
    outCosts.energy     += fullCost.energy;
6104
0
}
6105
6106
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
6107
0
{
6108
0
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
6109
6110
0
    const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
6111
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
6112
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
6113
0
    {
6114
0
        if (!(log2TrSize - m_hChromaShift < 2))
6115
0
        {
6116
0
            uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
6117
0
            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
6118
0
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
6119
0
            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
6120
0
                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
6121
0
        }
6122
0
    }
6123
6124
0
    if (!bSubdiv)
6125
0
    {
6126
0
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
6127
0
    }
6128
0
    else
6129
0
    {
6130
0
        uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
6131
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
6132
0
            codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange);
6133
0
    }
6134
0
}
6135
6136
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
6137
0
{
6138
0
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
6139
6140
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
6141
0
    {
6142
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
6143
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
6144
0
            saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1);
6145
0
        return;
6146
0
    }
6147
6148
0
    const uint32_t qtLayer = log2TrSize - 2;
6149
6150
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
6151
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
6152
0
    if (log2TrSizeC < 2)
6153
0
    {
6154
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
6155
0
        log2TrSizeC = 2;
6156
0
        codeChroma &= !(absPartIdx & 3);
6157
0
    }
6158
6159
0
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
6160
6161
0
    uint32_t numCoeffY = 1 << (log2TrSize * 2);
6162
0
    uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
6163
0
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
6164
0
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
6165
0
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
6166
6167
0
    if (codeChroma)
6168
0
    {
6169
0
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
6170
6171
0
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
6172
0
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
6173
6174
0
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
6175
0
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
6176
0
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
6177
0
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
6178
0
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
6179
0
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
6180
0
    }
6181
0
}
6182
6183
/* returns the number of bits required to signal a non-most-probable mode.
6184
 * on return mpms contains bitmap of most probable modes */
6185
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
6186
1.41M
{
6187
1.41M
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
6188
6189
1.41M
    mpms = 0;
6190
5.67M
    for (int i = 0; i < 3; ++i)
6191
4.25M
        mpms |= ((uint64_t)1 << mpmModes[i]);
6192
6193
1.41M
    return m_entropyCoder.bitsIntraModeNonMPM();
6194
1.41M
}
6195
6196
/* swap the current mode/cost with the mode with the highest cost in the
6197
 * current candidate list, if its cost is better (maintain a top N list) */
6198
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
6199
1.50M
{
6200
1.50M
    uint32_t maxIndex = 0;
6201
1.50M
    uint64_t maxValue = 0;
6202
6203
12.8M
    for (int i = 0; i < maxCandCount; i++)
6204
11.3M
    {
6205
11.3M
        if (maxValue < candCostList[i])
6206
1.62M
        {
6207
1.62M
            maxValue = candCostList[i];
6208
1.62M
            maxIndex = i;
6209
1.62M
        }
6210
11.3M
    }
6211
6212
1.50M
    if (cost < maxValue)
6213
1.43M
    {
6214
1.43M
        candCostList[maxIndex] = cost;
6215
1.43M
        candModeList[maxIndex] = mode;
6216
1.43M
    }
6217
1.50M
}
6218
6219
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
6220
614k
{
6221
614k
    CUData& cu = mode.cu;
6222
614k
    if (cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth)
6223
15.6k
    {
6224
15.6k
        if (cu.getQtRootCbf(0))
6225
676
        {
6226
676
            if (m_param->rdLevel >= 3)
6227
676
            {
6228
676
                mode.contexts.resetBits();
6229
676
                mode.contexts.codeDeltaQP(cu, 0);
6230
676
                uint32_t bits = mode.contexts.getNumberOfWrittenBits();
6231
676
                mode.totalBits += bits;
6232
676
                updateModeCost(mode);
6233
676
            }
6234
0
            else if (m_param->rdLevel <= 1)
6235
0
            {
6236
0
                mode.sa8dBits++;
6237
0
                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
6238
0
            }
6239
0
            else
6240
0
            {
6241
0
                mode.totalBits++;
6242
0
                updateModeCost(mode);
6243
0
            }
6244
676
        }
6245
14.9k
        else
6246
14.9k
            cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
6247
15.6k
    }
6248
614k
}
6249
6250
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
6251
91.5k
{
6252
91.5k
    CUData& cu = mode.cu;
6253
6254
91.5k
    if ((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP)
6255
18.6k
    {
6256
18.6k
        bool hasResidual = false;
6257
6258
        /* Check if any sub-CU has a non-zero QP */
6259
885k
        for (uint32_t blkIdx = 0; blkIdx < cuGeom.numPartitions; blkIdx++)
6260
866k
        {
6261
866k
            if (cu.getQtRootCbf(blkIdx))
6262
462
            {
6263
462
                hasResidual = true;
6264
462
                break;
6265
462
            }
6266
866k
        }
6267
18.6k
        if (hasResidual)
6268
462
        {
6269
462
            if (m_param->rdLevel >= 3)
6270
462
            {
6271
462
                mode.contexts.resetBits();
6272
462
                mode.contexts.codeDeltaQP(cu, 0);
6273
462
                uint32_t bits = mode.contexts.getNumberOfWrittenBits();
6274
462
                mode.totalBits += bits;
6275
462
                updateModeCost(mode);
6276
462
            }
6277
0
            else if (m_param->rdLevel <= 1)
6278
0
            {
6279
0
                mode.sa8dBits++;
6280
0
                mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
6281
0
            }
6282
0
            else
6283
0
            {
6284
0
                mode.totalBits++;
6285
0
                updateModeCost(mode);
6286
0
            }
6287
            /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
6288
            When the non-zero CBF sub-CU is found, stop */
6289
462
            cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
6290
462
        }
6291
18.1k
        else
6292
            /* No residual within this CU or subCU, so reset QP to RefQP */
6293
18.1k
            cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
6294
18.6k
    }
6295
91.5k
}