Coverage Report

Created: 2026-05-30 06:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/search.cpp
Line
Count
Source
1
/*****************************************************************************
2
* Copyright (C) 2013-2020 MulticoreWare, Inc
3
*
4
* Authors: Steve Borho <steve@borho.org>
5
*          Min Chen <chenm003@163.com>
6
*
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 2 of the License, or
10
* (at your option) any later version.
11
*
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
* GNU General Public License for more details.
16
*
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
*
21
* This program is also available under a commercial proprietary license.
22
* For more information, contact us at license @ x265.com.
23
*****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "picyuv.h"
28
#include "cudata.h"
29
30
#include "search.h"
31
#include "entropy.h"
32
#include "rdcost.h"
33
34
#include "analysis.h"  // TLD
35
#include "framedata.h"
36
#include "encoder.h"
37
38
using namespace X265_NS;
39
40
#if _MSC_VER
41
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
42
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
43
#pragma warning(disable: 4127) // conditional expression is constant
44
#endif
45
46
0
/* Constant bit count added to the list-selection bits of every candidate when
 * puMotionEstimation() estimates signalling cost (presumably the cost of the
 * MVP index flag -- confirm against the entropy coder). */
#define MVP_IDX_BITS 1
47
48
/* Zero-initialised int16_t array of MAX_CU_SIZE entries shared by all Search
 * instances.  ALIGN_VAR_32 presumably requests 32-byte alignment -- confirm in
 * common.h. */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
49
50
/* Build an empty Search object.  Nothing is allocated here: every buffer
 * pointer is cleared and the per-layer RQT state is zero-filled.  The real
 * allocations are performed by initSearch(). */
Search::Search()
{
    /* RQT scratch state for all layers starts out as all-zero bytes */
    memset(m_rqt, 0, sizeof(m_rqt));

    /* one CBF / transform-skip table pointer per colour plane */
    for (int plane = 0; plane < 3; plane++)
    {
        m_qtTempCbf[plane] = NULL;
        m_qtTempTransformSkipFlag[plane] = NULL;
    }

    /* encoder context, attached later */
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;

    /* intra prediction scratch buffers */
    m_intraPred = NULL;
    m_intraPredAngs = NULL;
    m_fencScaled = NULL;
    m_fencTransposed = NULL;

    /* transform-skip scratch buffers */
    m_tsCoeff = NULL;
    m_tsResidual = NULL;
    m_tsRecon = NULL;

    m_numLayers = 0;
    m_maxTUDepth = -1;
}
73
74
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
75
0
{
76
0
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
77
0
    m_param = &param;
78
0
    m_bFrameParallel = param.frameNumThreads > 1;
79
0
    m_numLayers = g_log2Size[param.maxCUSize] - 2;
80
#if ENABLE_SCC_EXT
81
    m_ibcEnabled = param.bEnableSCC;
82
#endif
83
84
0
    m_rdCost.setPsyRdScale(param.psyRd);
85
0
    m_rdCost.setSsimRd(param.bSsimRd);
86
0
    m_me.init(param.internalCsp);
87
88
0
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
89
0
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
90
0
        ok &= m_quant.allocNoiseReduction(param);
91
92
0
    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
93
94
    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
95
     * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
96
0
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
97
98
0
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
99
0
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
100
0
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
101
102
0
    m_limitTU = 0;
103
0
    if (m_param->limitTU)
104
0
    {
105
0
        if (m_param->limitTU == 1)
106
0
            m_limitTU = X265_TU_LIMIT_BFS;
107
0
        else if (m_param->limitTU == 2)
108
0
            m_limitTU = X265_TU_LIMIT_DFS;
109
0
        else if (m_param->limitTU == 3)
110
0
            m_limitTU = X265_TU_LIMIT_NEIGH;
111
0
        else if (m_param->limitTU == 4)
112
0
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
113
0
    }
114
115
    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
116
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
117
     * which are reconstructed at each depth are valid. At the end, the transform depth table
118
     * is walked and the coeff and recon at the correct depths are collected */
119
120
0
    if (param.internalCsp != X265_CSP_I400)
121
0
    {
122
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
123
0
        {
124
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
125
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
126
0
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
127
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
128
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
129
0
        }
130
0
    }
131
0
    else
132
0
    {
133
0
        for (uint32_t i = 0; i <= m_numLayers; i++)
134
0
        {
135
0
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
136
0
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
137
0
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
138
0
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
139
0
        }
140
0
    }
141
142
    /* the rest of these buffers are indexed per-depth */
143
0
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
144
0
    {
145
0
        int cuSize = param.maxCUSize >> i;
146
0
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
147
0
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
148
0
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
149
0
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
150
0
    }
151
152
0
    if (param.internalCsp != X265_CSP_I400)
153
0
    {
154
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
155
0
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
156
0
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
157
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
158
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
159
0
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
160
0
    }
161
0
    else
162
0
    {
163
0
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
164
0
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
165
0
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
166
0
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
167
0
    }
168
169
0
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
170
0
    m_fencScaled = m_intraPred + 32 * 32;
171
0
    m_fencTransposed = m_fencScaled + 32 * 32;
172
0
    m_intraPredAngs = m_fencTransposed + 32 * 32;
173
174
0
    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
175
0
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
176
0
    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
177
178
#if ENABLE_SCC_EXT
179
    m_numBVs = 0;
180
    m_numBV16s = 0;
181
#endif
182
183
0
    return ok;
184
185
0
fail:
186
0
    return false;
187
0
}
188
189
/* Release every buffer owned by this Search instance.
 *
 * The constructor leaves m_param NULL until initSearch() runs, so the
 * per-depth loop (whose bound lives in m_param) must be skipped for an object
 * that was never initialised; in that case the per-depth Yuv buffers were
 * never created either. */
Search::~Search()
{
    /* per-layer RQT buffers; coeffRQT[1] and [2] point into coeffRQT[0] */
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    /* per-depth buffers only exist once initSearch() has attached m_param */
    if (m_param)
    {
        for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
        {
            m_rqt[i].tmpResiYuv.destroy();
            m_rqt[i].tmpPredYuv.destroy();
            m_rqt[i].bidirPredYuv[0].destroy();
            m_rqt[i].bidirPredYuv[1].destroy();
        }
    }

    /* entries [1] and [2] of these tables point into entry [0] */
    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);

    /* m_fencScaled, m_fencTransposed and m_intraPredAngs point into m_intraPred */
    X265_FREE(m_intraPred);
    X265_FREE(m_tsCoeff);
    X265_FREE(m_tsResidual);
    X265_FREE(m_tsRecon);
}
213
214
int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
215
0
{
216
0
    X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
217
218
0
    m_me.setQP(qp);
219
0
    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
220
221
0
    int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
222
0
    m_quant.setQPforQuant(ctu, quantQP);
223
0
    return quantQP;
224
0
}
225
226
void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData& cu, PicYuv* fencPic, int puOffset, PartSize part, int areaIdx, int finalIdx, bool isMVP , const int* neighborIdx)
227
0
{
228
#ifdef DETAILED_CU_STATS
229
    m_stats[cu.m_encData->m_frameEncoderID].countMotionEstimate++;
230
#endif
231
232
0
    int satdCost = 0;
233
0
    int numPredDir = slice->isInterP() ? 1 : 2;
234
0
    int searchRange = isMVP ? 32 : m_param->searchRange;
235
236
0
    MV mvp(0,0);
237
0
    MV mvzero(0,0);
238
239
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
240
0
    MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
241
242
0
    MotionData bestME[2];
243
0
    bestME[0].cost = MAX_UINT;
244
0
    bestME[1].cost = MAX_UINT;
245
246
0
    int numPart = cu.getNumPartInter(0);
247
0
    uint32_t lastMode = 0;
248
249
0
    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
250
0
    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
251
0
    int slotIdx = row * m_slice->m_sps->numCuInWidth + col;
252
253
0
    int numMvc = 0;
254
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
255
0
    {
256
0
        PredictionUnit pu(cu, cuGeom, puIdx);
257
258
0
        int pos = finalIdx + puIdx * puOffset;
259
260
0
        InterNeighbourMV neighbours[6];
261
0
        if(!isMVP)
262
0
           cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, neighbours);
263
264
0
        for (int list = 0; list < numPredDir; list++)
265
0
        {
266
0
            int numIdx = slice->m_numRefIdx[list];
267
0
            for (int ref = 0; ref < numIdx; ref++)
268
0
            {
269
0
                getBlkBits(part, slice->isInterP(), puIdx, lastMode, m_listSelBits);
270
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
271
0
                bits += getTUBits(ref, numIdx);
272
273
0
                MV mvmin, mvmax, outmv,mvp_lowres;;
274
0
                mvp = !isMVP ? m_areaBestMV[areaIdx][list][ref] : mvp;
275
276
0
                MV zeroMV[2] = {0,0};
277
0
                const MV* amvp = zeroMV;
278
0
                int mvpIdx = 0;
279
280
0
                PicYuv* recon = slice->m_mref[list][ref].reconPic;
281
0
                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0);
282
0
                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
283
284
0
                bool bLowresMVP = false;
285
0
                if (!isMVP)
286
0
                {
287
0
                    for(int dir = MD_LEFT; dir <= MD_ABOVE_LEFT ; dir++)
288
0
                    {
289
0
                        int neighIdx = neighborIdx[dir];
290
0
                        if (neighIdx >= 0)
291
0
                        {
292
0
                            MEData& neighborData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + neighIdx];
293
0
                            for (int i = 0; i < 2; i++)
294
0
                            {
295
0
                                neighbours[dir].mv[i] = neighborData.mv[i];
296
0
                                neighbours[dir].refIdx[i] = neighborData.ref[i];
297
0
                            }
298
0
                            neighbours[dir].isAvailable = (neighborData.ref[0] >= 0 || neighborData.ref[1] >= 0);
299
0
                        }
300
0
                        else
301
0
                        {
302
0
                            for (int i = 0; i < 2; i++)
303
0
                                neighbours[dir].refIdx[i] = -1;
304
0
                            neighbours[dir].isAvailable = false;
305
0
                        }
306
0
                    }
307
308
0
                    numMvc = cu.getPMV(neighbours, list, ref, amvpCand[list][ref], mvc);
309
0
                    if (numMvc > 0)
310
0
                    {
311
0
                        amvp = amvpCand[list][ref];
312
0
                        mvpIdx = selectMVP(cu, pu, amvp, list, ref);
313
0
                        mvp = amvp[mvpIdx];                 
314
0
                    }
315
0
                    else if (slice->m_refFrameList[list][ref]->m_encData->m_slice->m_sliceType != I_SLICE)
316
0
                    {
317
0
                        MEData meData = slice->m_refFrameList[list][ref]->m_encData->m_slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
318
319
0
                        bool bi = (meData.ref[0] >= 0 && meData.ref[1] >= 0);
320
0
                        bool uniL0 = (meData.ref[0] >= 0 && meData.ref[1] == REF_NOT_VALID);
321
0
                        bool uniL1 = (meData.ref[1] >= 0 && meData.ref[0] == REF_NOT_VALID);
322
323
0
                        if (uniL0)
324
0
                            mvp = meData.mv[0];
325
0
                        else if (uniL1)
326
0
                            mvp = meData.mv[1];
327
0
                        else if (bi)
328
0
                            mvp = meData.mv[list];
329
0
                    }
330
0
                }
331
332
0
                m_me.setMVP(mvp);
333
334
0
                if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad))
335
0
                {
336
0
                    uint32_t blockX = cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + (pu.width  >> 1);
337
0
                    uint32_t blockY = cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + (pu.height >> 1);
338
339
0
                    if (blockX < m_slice->m_sps->picWidthInLumaSamples && blockY < m_slice->m_sps->picHeightInLumaSamples)
340
0
                    {
341
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
342
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
343
0
                        if (lmv.notZero() && !layer)
344
0
                        {
345
0
                            mvc[numMvc++] = lmv;
346
0
                            bLowresMVP = true;
347
0
                        }
348
0
                        mvp_lowres = lmv;
349
0
                    }
350
0
                }
351
352
0
                if (m_param->searchMethod == X265_SEA)
353
0
                {
354
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
355
0
                        m_me.integral[planes] = slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset;
356
0
                }
357
358
0
                setSearchRange(cu, mvp, searchRange, mvmin, mvmax);
359
360
0
                if (isMVP)
361
0
                {
362
0
                    satdCost = m_me.diamondSearch(&slice->m_mref[list][ref], mvmin, mvmax, outmv);
363
0
                    m_areaBestMV[areaIdx][list][ref] = outmv;
364
0
                }
365
0
                else
366
0
                {
367
0
                    m_vertRestriction = slice->m_refPOCList[list][ref] == slice->m_poc;
368
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
369
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
370
371
0
                    if (bLowresMVP && mvp_lowres.notZero() && mvp_lowres != mvp)
372
0
                    {
373
0
                        MV outmv_lowres;
374
0
                        bLowresMVP = false;
375
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
376
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref],  mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange,outmv_lowres, m_param->maxSlices,
377
0
                            m_vertRestriction, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0): 0);
378
379
0
                        if (lowresMvCost < satdCost)
380
0
                        {
381
0
                            outmv = outmv_lowres;
382
0
                            satdCost = lowresMvCost;
383
0
                            bLowresMVP = true;
384
0
                        }
385
0
                    }
386
0
                }
387
388
0
                bits += m_me.bitcost(outmv);
389
0
                uint32_t mvCost = m_me.mvcost(outmv);
390
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
391
392
0
                if(!isMVP)
393
0
                {
394
0
                    if (bLowresMVP)
395
0
                        updateMVP(mvp, outmv, bits, cost, mvp_lowres);
396
397
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
398
0
                }
399
0
                if (cost < bestME[list].cost)
400
0
                {
401
0
                    bestME[list].mv = outmv;
402
0
                    bestME[list].mvp = mvp;
403
0
                    bestME[list].mvpIdx = 0;
404
0
                    bestME[list].cost = cost;
405
0
                    bestME[list].bits = bits;
406
0
                    bestME[list].mvCost = mvCost;
407
0
                    bestME[list].ref = ref;
408
0
                }
409
0
            }
410
0
        }
411
412
0
        if (isMVP)
413
0
            return;
414
415
        //Bi-Direction
416
0
        MotionData bidir[2];
417
0
        uint32_t bidirCost = MAX_UINT;
418
0
        int bidirBits = 0;
419
0
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
420
421
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&
422
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !isMVP)
423
0
        {
424
0
            bidir[0] = bestME[0];
425
0
            bidir[1] = bestME[1];
426
427
0
            if (m_me.bChromaSATD)
428
0
            {
429
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
430
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
431
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
432
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
433
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
434
435
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
436
0
                    m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
437
0
            }
438
0
            else
439
0
            {
440
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
441
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
442
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
443
444
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
445
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
446
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
447
0
                    bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
448
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
449
0
            }
450
451
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
452
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
453
454
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
455
0
            if (bTryZero)
456
0
            {
457
0
                MV mvmin, mvmax;
458
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
459
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
460
0
                mvmax.y += 2;
461
0
                mvmin <<= 2;
462
0
                mvmax <<= 2;
463
464
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
465
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
466
0
            }
467
0
            if (bTryZero)
468
0
            {
469
0
                if (m_me.bChromaSATD)
470
0
                {
471
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
472
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
473
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
474
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
475
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
476
477
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
478
0
                        m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
479
0
                }
480
0
                else
481
0
                {
482
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
483
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
484
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
485
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
486
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
487
0
                }
488
489
0
                MV mvp0 = bestME[0].mvp;
490
0
                int mvpIdx0 = bestME[0].mvpIdx;
491
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
492
493
0
                MV mvp1 = bestME[1].mvp;
494
0
                int mvpIdx1 = bestME[1].mvpIdx;
495
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
496
497
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
498
499
0
                if (cost < bidirCost)
500
0
                {
501
0
                    bidir[0].mv = mvzero;
502
0
                    bidir[1].mv = mvzero;
503
0
                    bidir[0].mvp = mvp0;
504
0
                    bidir[1].mvp = mvp1;
505
0
                    bidir[0].mvpIdx = mvpIdx0;
506
0
                    bidir[1].mvpIdx = mvpIdx1;
507
0
                    bidirCost = cost;
508
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
509
0
                }
510
0
            }
511
0
        }
512
0
        MEData& outME = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + pos];
513
514
0
        outME.ref[0] = REF_NOT_VALID;
515
0
        outME.ref[1] = REF_NOT_VALID;
516
517
0
        if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
518
0
        {
519
0
            lastMode = 2;
520
521
0
            outME.mv[0] = bidir[0].mv;
522
0
            outME.mv[1] = bidir[1].mv;
523
0
            outME.mvp[0] = bidir[0].mvp;
524
0
            outME.mvp[1] = bidir[1].mvp;
525
0
            outME.mvCost[0] = bestME[0].mvCost;
526
0
            outME.mvCost[1] = bestME[1].mvCost;
527
0
            outME.ref[0] = bestME[0].ref;
528
0
            outME.ref[1] = bestME[1].ref;
529
530
0
            outME.bits = bidirBits;
531
0
            outME.cost = bidirCost;
532
0
        }
533
0
        else if (bestME[0].cost <= bestME[1].cost)
534
0
        {
535
0
            lastMode = 0;
536
537
0
            outME.mv[0] = bestME[0].mv;
538
0
            outME.mvp[0] = bestME[0].mvp;
539
0
            outME.mvCost[0] = bestME[0].mvCost;
540
0
            outME.cost = bestME[0].cost;
541
0
            outME.bits = bestME[0].bits;
542
0
            outME.ref[0] = bestME[0].ref;
543
0
            outME.ref[1] = REF_NOT_VALID;
544
0
        }
545
0
        else
546
0
        {
547
0
            lastMode = 1;
548
549
0
            outME.mv[1] = bestME[1].mv;
550
0
            outME.mvp[1] = bestME[1].mvp;
551
0
            outME.mvCost[1] = bestME[1].mvCost;
552
0
            outME.cost = bestME[1].cost;
553
0
            outME.bits = bestME[1].bits;
554
0
            outME.ref[1] = bestME[1].ref;
555
0
            outME.ref[0] = REF_NOT_VALID;
556
0
        }
557
0
    }
558
0
}
559
560
#if CHECKED_BUILD || _DEBUG
561
void Search::invalidateContexts(int fromDepth)
562
{
563
    /* catch reads without previous writes */
564
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
565
    {
566
        m_rqt[d].cur.markInvalid();
567
        m_rqt[d].rqtTemp.markInvalid();
568
        m_rqt[d].rqtRoot.markInvalid();
569
        m_rqt[d].rqtTest.markInvalid();
570
    }
571
}
572
#else
573
0
void Search::invalidateContexts(int) {}
574
#endif
575
576
void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
577
0
{
578
0
    uint32_t subdiv     = tuDepth < cu.m_tuDepth[absPartIdx];
579
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
580
581
0
    if (!(log2TrSize - m_hChromaShift < 2))
582
0
    {
583
0
        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
584
0
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
585
0
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
586
0
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
587
0
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
588
0
    }
589
590
0
    if (subdiv)
591
0
    {
592
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
593
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
594
0
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
595
0
    }
596
0
}
597
598
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
599
0
{
600
0
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
601
0
        return;
602
603
0
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
604
605
0
    if (tuDepth < cu.m_tuDepth[absPartIdx])
606
0
    {
607
0
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
608
0
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
609
0
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
610
611
0
        return;
612
0
    }
613
614
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
615
616
0
    if (log2TrSizeC < 2)
617
0
    {
618
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
619
0
        if (absPartIdx & 3)
620
0
            return;
621
0
        log2TrSizeC = 2;
622
0
    }
623
624
0
    uint32_t qtLayer = log2TrSize - 2;
625
626
0
    if (m_csp != X265_CSP_I422)
627
0
    {
628
0
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
629
0
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
630
0
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
631
0
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
632
0
    }
633
0
    else
634
0
    {
635
0
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
636
0
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
637
0
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
638
0
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
639
0
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
640
0
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
641
0
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
642
0
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
643
0
    }
644
0
}
645
646
/* RD-evaluate one intra luma transform unit, optionally comparing "code the TU
 * at this size" against "split into four smaller TUs".
 *
 *   mode         - mode under evaluation (source, prediction and CU data)
 *   cuGeom       - geometry of the CU that owns the TU
 *   tuDepth      - TU depth relative to the CU
 *   absPartIdx   - partition index of the TU inside the CU
 *   bAllowSplit  - when false, a split is only tried if the TU cannot be
 *                  coded at this size
 *   outCost      - the winner's rdcost/distortion/bits/energy are ADDED here
 *   depthRange   - [0] = smallest, [1] = largest permitted log2 TU size
 *
 * When the unsplit TU wins, its reconstruction is copied into the frame's
 * recon picture so that following intra blocks can predict from it. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t qtLayer    = log2TrSize - 2;
    uint32_t sizeIdx    = log2TrSize - 2;
    bool mightNotSplit  = log2TrSize <= depthRange[1];
    bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    bool bEnableRDOQ    = !!m_param->rdoqLevel;

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
    {
        // remember the entropy state so the split attempt can start from the same point
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride   = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // refresh the luma bit-estimation tables when RDOQ is on
        // NOTE(review): presumably consumed by transformNxN below -- confirm
        if (bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);

            /* Pick the aligned add_ps variant only when every buffer offset and
             * every stride is a multiple of 64. Each offset is computed with the
             * width of the buffer it belongs to, matching the other intra
             * residual routines in this file. */
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
        }
        else
            // no coded residual, recon = pred
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);

        bCBF = (!!numSig) << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);

        // count the bits of everything coded for this TU, starting from zero
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            // CU-level syntax is only charged to the first partition
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                // depth-0 TU of a four-way partitioned CU: code all four luma modes
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                // first TU of a quadrant: code that quadrant's luma mode
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        // no subdiv flag is coded at the smallest permitted TU size
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        // any non-zero rdPenalty makes 32x32 intra TUs in non-I slices four times as expensive
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }
    else
        // the unsplit TU is not allowed: make sure the split candidate wins the comparison
        fullCost.rdcost = MAX_INT64;

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
        }

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        // transform-skip is evaluated for the children when the PPS enables it,
        // the child size is small enough and the CU is not transquant-bypassed
        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        // a coded child marks this depth's CBF bit on the first partition
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else if (m_rdCost.m_ssimRd)
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            // split wins: the children have already written their own state
            outCost.rdcost     += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits       += splitCost.bits;
            outCost.energy     += splitCost.energy;
            return;
        }
        else
        {
            // recover entropy state of full-size TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    PicYuv*  reconPic = m_frame->m_reconPic[0];
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost     += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits       += fullCost.bits;
    outCost.energy     += fullCost.energy;
}
850
851
/* RD-evaluate one intra luma TU twice -- once with the regular transform and
 * once with transform-skip -- and keep the cheaper candidate.
 *
 *   mode        - mode under evaluation (source, prediction and CU data)
 *   cuGeom      - geometry of the CU that owns the TU
 *   tuDepth     - TU depth relative to the CU
 *   absPartIdx  - partition index of the TU inside the CU
 *   outCost     - the winner's rdcost/distortion/bits/energy are ADDED here
 *
 * The regular-transform candidate works directly in the m_rqt coefficient and
 * recon buffers; the transform-skip candidate works in m_tsCoeff / m_tsRecon
 * and is copied over only if it wins. The chosen reconstruction is written to
 * the frame's recon picture for use by following intra blocks. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    bool bEnableRDOQ = !!m_param->rdoqLevel;

    X265_CHECK((1u << log2TrSize) <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;
    Yuv* predYuv = &mode.predYuv;
    const Yuv* fencYuv = mode.fencYuv;

    Cost fullCost;
    fullCost.rdcost = MAX_INT64;
    int      bTSkip = 0;
    uint32_t bCBF = 0;

    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
    pixel*   pred = predYuv->getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t stride = fencYuv->m_size;
    uint32_t sizeIdx = log2TrSize - 2;

    // init availability pattern
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

    // get prediction signal
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // store original entropy coding status
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    // pass 0 = regular transform, pass 1 = transform-skip
    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t tmpCost;
        uint32_t tmpEnergy = 0;

        // the transform-skip pass uses scratch buffers so pass 0's result stays intact
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
        pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
        bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

        // recomputed every pass: the inverse transform below overwrites residual
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
            bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
            // no residual coded, recon = pred
            primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);

        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        // both candidates are costed from the same starting entropy state
        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            // CU-level syntax is only charged to the first partition
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();

        // keep the post-encode state of the regular transform in case it wins
        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else if (m_rdCost.m_ssimRd)
        {
            tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
            tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

        if (tmpCost < fullCost.rdcost)
        {
            bTSkip = useTSkip;
            bCBF = !!numSig;
            fullCost.rdcost = tmpCost;
            fullCost.distortion = tmpDist;
            fullCost.bits = tmpBits;
            fullCost.energy = tmpEnergy;
        }
    }

    if (bTSkip)
    {
        // transform-skip won: promote its scratch results into the RQT buffers.
        // m_tsRecon was written with a row stride of MAX_TS_SIZE, so read it back with the same stride.
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
    }
    else if (checkTransformSkip)
    {
        // transform-skip was tried and lost: undo the flags and entropy state it left behind
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // set reconstruction for next intra prediction blocks
    PicYuv*  reconPic = m_frame->m_reconPic[0];
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
1026
1027
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
1028
/* Produce the luma residual, coefficients and reconstruction for an intra TU
 * without any RD comparison. The TU is coded at the current size whenever that
 * is permitted; otherwise it is split into four and each quadrant is handled
 * recursively. Coefficients go to cu.m_trCoeff[0] and the reconstruction goes
 * straight into the frame's recon picture. */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    const uint32_t fullDepth  = cuGeom.depth + tuDepth;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    /* rdPenalty == 2 is still honoured by forbidding 32x32 intra TUs. rdPenalty == 1
     * cannot be applied here because no RD cost is being measured */
    bool codeAtThisSize = log2TrSize <= depthRange[1];
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
        codeAtThisSize = false;

    if (!codeAtThisSize)
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        /* code split block */
        const uint32_t quadParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t childCbf = 0;
        uint32_t quadIdx = absPartIdx;
        for (uint32_t quad = 0; quad < 4; ++quad)
        {
            residualTransformQuantIntra(mode, cuGeom, quadIdx, tuDepth + 1, depthRange);
            childCbf |= cu.getCbf(quadIdx, TEXT_LUMA, tuDepth + 1);
            quadIdx += quadParts;
        }

        // a coded child marks this depth's CBF bit on the first partition
        cu.m_cbf[0][absPartIdx] |= (childCbf << tuDepth);
        return;
    }

    const pixel* srcY    = mode.fencYuv->getLumaAddr(absPartIdx);
    pixel*       predY   = mode.predYuv.getLumaAddr(absPartIdx);
    int16_t*     resiY   = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t     stride  = mode.fencYuv->m_size;
    uint32_t     sizeIdx = log2TrSize - 2;

    // destination of the reconstruction inside the frame's recon picture
    PicYuv*  reconPic  = m_frame->m_reconPic[0];
    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;

    // init availability pattern
    uint32_t dirMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors neighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &neighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, neighbors, dirMode);

    // get prediction signal
    predIntraLumaAng(dirMode, predY, stride, log2TrSize);

    X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    coeff_t* coeffY = cu.m_trCoeff[0] + (absPartIdx << (LOG2_UNIT_SIZE * 2));

    primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](srcY, predY, resiY, stride);

    uint32_t numSig = m_quant.transformNxN(cu, srcY, stride, resiY, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
    if (!numSig)
    {
        // nothing coded: the reconstruction is just the prediction
        primitives.cu[sizeIdx].copy_pp(picReconY, picStride, predY, stride);
    }
    else
    {
        m_quant.invtransformNxN(cu, resiY, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);

        // the aligned add_ps variant is used only when all offsets and strides are multiples of 64
        bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
        bool predAlign      = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
        bool residualAlign  = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
        bool allAligned     = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
        primitives.cu[sizeIdx].add_ps[allAligned](picReconY, picStride, predY, resiY, stride, stride);
    }

    cu.setCbfSubParts(numSig ? 1 << tuDepth : 0, TEXT_LUMA, absPartIdx, fullDepth);
}
1103
1104
/* Walk the transform quad-tree recorded in cu.m_tuDepth. For every TU at its
 * final depth, copy the luma coefficients from the matching m_rqt layer into
 * cu.m_trCoeff[0] and the luma reconstruction into reconYuv. */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth != cu.m_tuDepth[absPartIdx])
    {
        // not the final depth of this TU: visit the four quadrants
        const uint32_t quadParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t quadIdx = absPartIdx;
        for (uint32_t quad = 0; quad < 4; ++quad)
        {
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, quadIdx);
            quadIdx += quadParts;
        }
        return;
    }

    const uint32_t layer = log2TrSize - 2;

    // copy transform coefficients (same offset in source and destination)
    const uint32_t lumaOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
    const size_t   numBytes   = sizeof(coeff_t) << (log2TrSize * 2);
    memcpy(cu.m_trCoeff[0] + lumaOffset, m_rqt[layer].coeffRQT[0] + lumaOffset, numBytes);

    // copy reconstruction
    m_rqt[layer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
}
1128
1129
/* Shift each of the two sub-TU CBF values up by one bit and set bit 0 of both
 * to the OR of the two original values. The array is updated in place and each
 * result is truncated to 8 bits. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t eitherCoded = subTUCBF[0] | subTUCBF[1];

    for (int i = 0; i < 2; i++)
        subTUCBF[i] = static_cast<uint8_t>((subTUCBF[i] << 1) | eitherCoded);
}
1135
1136
/* 4:2:2 post-TU split processing */
1137
/* Rewrite the CBFs of the two sub-TUs that make up one chroma TU: each
 * sub-TU's flag is moved down one level and the combined (parent) flag is
 * placed at the current level, via offsetCBFs(). */
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    if (log2TrSize == 2)
    {
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        // a 4x4 luma TU is sized as the next larger TU for the partition count below
        ++log2TrSize;
    }

    // each sub-TU covers half of the TU's partitions
    const uint32_t halfParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
    const uint32_t lowerIdx  = absPartIdx + halfParts;

    // move the CBFs down a level and set the parent CBF
    uint8_t cbfPair[2];
    cbfPair[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
    cbfPair[1] = cu.getCbf(lowerIdx, ttype, tuDepth);
    offsetCBFs(cbfPair);

    cu.setCbfPartRange(cbfPair[0] << tuDepth, ttype, absPartIdx, halfParts);
    cu.setCbfPartRange(cbfPair[1] << tuDepth, ttype, lowerIdx, halfParts);
}
1158
1159
/* accumulates chroma distortion (and psy/ssim energy when enabled) into outCost; returns nothing */
1160
/* Predict, transform and reconstruct the U and V blocks of one intra TU,
 * following the transform quad-tree already recorded in cu.m_tuDepth.
 * Hands over to codeIntraChromaTSkip() when transform-skip must be evaluated.
 * Scaled SSE is added to outCost.distortion and, when psy-rd or ssim-rd is
 * active, the matching energy term is added to outCost.energy. */
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    CUData& cu = mode.cu;
    const uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        // TU is split below this depth: code the four quadrants and merge their CBFs upward
        const uint32_t quadParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;
        uint32_t quadIdx = absPartIdx;
        for (uint32_t quad = 0; quad < 4; ++quad)
        {
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, quadIdx, outCost);
            anyCbfU |= cu.getCbf(quadIdx, TEXT_CHROMA_U, tuDepth + 1);
            anyCbfV |= cu.getCbf(quadIdx, TEXT_CHROMA_V, tuDepth + 1);
            quadIdx += quadParts;
        }
        cu.m_cbf[1][absPartIdx] |= (anyCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (anyCbfV << tuDepth);
        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");

        // only partition indices that are a multiple of four code the
        // (clamped to 4x4) chroma block, one TU level up
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    if (m_param->rdoqLevel)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

    // transform-skip is evaluated when the PPS enables it, the chroma block is small enough and
    // the CU is not transquant-bypassed; fast mode also requires that luma chose transform-skip
    const bool tskipPossible = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
    const bool tskipFastOk   = !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
    if (tskipPossible && tskipFastOk)
    {
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
        return;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t qtLayer  = log2TrSize - 2;
    const uint32_t sizeIdxC = log2TrSizeC - 2;
    uint32_t cStride        = mode.fencYuv->m_csize;
    uint32_t reconQtStride  = m_rqt[qtLayer].reconQtYuv.m_csize;

    PicYuv*  reconPic  = m_frame->m_reconPic[0];
    intptr_t picStride = reconPic->m_strideC;

    // 4:2:2 codes each chroma TU as two vertically split sub-TUs
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
    const uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        // resolve the chroma prediction direction once; it is the same for U and V
        uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        const uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            const pixel* fenc      = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel*       pred      = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t*     residual  = resiYuv.getChromaAddr(chromaId, absPartIdxC);
            coeff_t*     coeffC    = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel*       reconQt   = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            pixel*       picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, cStride, log2TrSizeC);
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            primitives.cu[sizeIdxC].calcresidual[cStride % 64 == 0](fenc, pred, residual, cStride);

            uint32_t numSig = m_quant.transformNxN(cu, fenc, cStride, residual, cStride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (!numSig)
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, cStride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                m_quant.invtransformNxN(cu, residual, cStride, coeffC, log2TrSizeC, ttype, true, false, numSig);

                // the aligned add_ps variant is used only when all offsets and strides are multiples of 64
                bool reconQtAlign  = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool predAlign     = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool allAligned    = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (cStride % 64 == 0);
                primitives.cu[sizeIdxC].add_ps[allAligned](reconQt, reconQtStride, pred, residual, cStride, cStride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }

            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, cStride));

            if (m_rdCost.m_psyRd)
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, cStride, reconQt, reconQtStride);
            else if (m_rdCost.m_ssimRd)
                outCost.energy += m_quant.ssimDistortion(cu, fenc, cStride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);

            // publish the reconstruction to the frame's recon picture
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        // 4:2:2: fold the two sub-TU CBFs into parent/child form
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
1286
1287
/* returns distortion */
1288
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
1289
0
{
1290
0
    CUData& cu = mode.cu;
1291
0
    uint32_t fullDepth  = cuGeom.depth + tuDepth;
1292
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
1293
0
    const uint32_t log2TrSizeC = 2;
1294
0
    uint32_t qtLayer = log2TrSize - 2;
1295
1296
    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
1297
     * so the entropy coder is not very accurate. The best we can do is return it in the same
1298
     * condition as it arrived, and to do all bit estimates from the same state. */
1299
0
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
1300
1301
0
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
1302
0
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1303
1304
0
    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1305
0
    do
1306
0
    {
1307
0
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1308
1309
0
        IntraNeighbors intraNeighbors;
1310
0
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);
1311
1312
0
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1313
0
        {
1314
0
            TextType ttype = (TextType)chromaId;
1315
1316
0
            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
1317
0
            pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1318
0
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
1319
0
            uint32_t stride = mode.fencYuv->m_csize;
1320
0
            const uint32_t sizeIdxC = log2TrSizeC - 2;
1321
1322
0
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1323
0
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1324
0
            pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1325
0
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
1326
1327
            // init availability pattern
1328
0
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
1329
1330
0
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1331
0
            if (chromaPredMode == DM_CHROMA_IDX)
1332
0
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1333
0
            if (m_csp == X265_CSP_I422)
1334
0
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1335
1336
            // get prediction signal
1337
0
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
1338
1339
0
            uint64_t bCost = MAX_INT64;
1340
0
            sse_t bDist = 0;
1341
0
            uint32_t bCbf = 0;
1342
0
            uint32_t bEnergy = 0;
1343
0
            int      bTSkip = 0;
1344
1345
0
            int checkTransformSkip = 1;
1346
0
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
1347
0
            {
1348
0
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
1349
0
                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
1350
0
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
1351
1352
0
                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
1353
1354
0
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
1355
0
                if (numSig)
1356
0
                {
1357
0
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
1358
0
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
1359
0
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1360
0
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
1361
0
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
1362
0
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
1363
0
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1364
0
                }
1365
0
                else if (useTSkip)
1366
0
                {
1367
0
                    checkTransformSkip = 0;
1368
0
                    break;
1369
0
                }
1370
0
                else
1371
0
                {
1372
0
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
1373
0
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1374
0
                }
1375
0
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
1376
0
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
1377
1378
0
                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1379
1380
0
                uint32_t tmpBits = 0, tmpEnergy = 0;
1381
0
                if (numSig)
1382
0
                {
1383
0
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1384
0
                    m_entropyCoder.resetBits();
1385
0
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1386
0
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
1387
0
                }
1388
1389
0
                uint64_t tmpCost;
1390
0
                if (m_rdCost.m_psyRd)
1391
0
                {
1392
0
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
1393
0
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
1394
0
                }
1395
0
                else if(m_rdCost.m_ssimRd)
1396
0
                {
1397
0
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
1398
0
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
1399
0
                }
1400
0
                else
1401
0
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
1402
1403
0
                if (tmpCost < bCost)
1404
0
                {
1405
0
                    bCost = tmpCost;
1406
0
                    bDist = tmpDist;
1407
0
                    bTSkip = useTSkip;
1408
0
                    bCbf = !!numSig;
1409
0
                    bEnergy = tmpEnergy;
1410
0
                }
1411
0
            }
1412
1413
0
            if (bTSkip)
1414
0
            {
1415
0
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
1416
0
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
1417
0
            }
1418
1419
0
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1420
0
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1421
1422
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
1423
0
            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
1424
0
            intptr_t picStride = reconPic->m_strideC;
1425
0
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
1426
1427
0
            outCost.distortion += bDist;
1428
0
            outCost.energy += bEnergy;
1429
0
        }
1430
0
    }
1431
0
    while (tuIterator.isNextSection());
1432
1433
0
    if (splitType == VERTICAL_SPLIT)
1434
0
    {
1435
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
1436
0
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
1437
0
    }
1438
1439
0
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1440
0
}
1441
1442
/* Walk the transform quadtree below (absPartIdx, tuDepth) and, at every chroma
 * leaf, copy the U/V coefficients from the RQT layer buffers into the CU and
 * the chroma reconstruction from the RQT layer into reconYuv. A node is a
 * chroma leaf when the luma TU depth is reached or when the chroma TU is
 * already 4x4 (log2 size 2). */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t lumaTuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize  = cu.m_log2CUSize[0] - tuDepth;
    const uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (lumaTuDepth != tuDepth && log2TrSizeC != 2)
    {
        // interior node: visit the four quadrants one level down
        const uint32_t partsPerQuadrant = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t subPartIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; quadrant++)
        {
            extractIntraResultChromaQT(cu, reconYuv, subPartIdx, tuDepth + 1);
            subPartIdx += partsPerQuadrant;
        }
        return;
    }

    // leaf: locate the RQT layer that holds this TU's data
    const uint32_t qtLayer      = log2TrSize - 2 - (lumaTuDepth - tuDepth);
    const uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
    // 4:2:2 carries twice as many chroma coefficients per TU
    const uint32_t numCoeffC    = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));

    // copy transform coefficients of both chroma planes (1 = U, 2 = V)
    for (uint32_t plane = 1; plane <= 2; plane++)
    {
        coeff_t* dst = cu.m_trCoeff[plane] + coeffOffsetC;
        coeff_t* src = m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC;
        memcpy(dst, src, sizeof(coeff_t) * numCoeffC);
    }

    // copy reconstruction
    m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
}
1472
1473
/* Intra chroma residual coding without RD decisions: recurse down to the TU
 * depth already stored in the CU, then for each chroma plane predict,
 * transform/quantize, and write the reconstruction straight into
 * m_frame->m_reconPic[0]. Coefficients go to cu.m_trCoeff and the chroma CBFs
 * of the CU are updated. Transform skip is not handled here. */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    const uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        // interior node: code the four quadrants, then merge their CBFs upward
        const uint32_t partsPerQuadrant = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;
        uint32_t subPartIdx = absPartIdx;
        for (uint32_t quadrant = 0; quadrant < 4; quadrant++)
        {
            residualQTIntraChroma(mode, cuGeom, subPartIdx, tuDepth + 1);
            anyCbfU |= cu.getCbf(subPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            anyCbfV |= cu.getCbf(subPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            subPartIdx += partsPerQuadrant;
        }
        cu.m_cbf[1][absPartIdx] |= (anyCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (anyCbfV << tuDepth);

        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    const bool chromaBelowMin = log2TrSizeC < 2;
    if (chromaBelowMin)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        // only the first of the four sibling partitions codes the shared chroma TU
        if ((absPartIdx & 3) != 0)
            return;
        tuDepthC = tuDepth - 1;
        log2TrSizeC = 2;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const uint32_t sizeIdxC = log2TrSizeC - 2;
    uint32_t stride = mode.fencYuv->m_csize;

    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        const uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const TextType ttype = (TextType)chromaId;

            // source, prediction and residual buffers for this plane
            const pixel* fenc     = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel*       pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t*     residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);

            // coefficients are written directly into the CU
            const uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC;

            // reconstruction is written directly into the reconstructed picture
            PicYuv*  reconPic  = m_frame->m_reconPic[0];
            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;

            // resolve the effective chroma prediction mode
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
            {
                const uint32_t lumaIdx = (m_csp == X265_CSP_I444) ? absPartIdxC : 0;
                chromaPredMode = cu.m_lumaIntraDir[lumaIdx];
            }
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

            const uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (!numSig)
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                continue;
            }

            m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);

            // pick the aligned add_ps variant only when every buffer offset and stride is a multiple of 64
            const bool picAligned  = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
            const bool predAligned = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
            const bool resiAligned = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
            const bool allAligned  = picAligned && predAligned && resiAligned && (picStride % 64 == 0) && (stride % 64 == 0);

            primitives.cu[sizeIdxC].add_ps[allAligned](picReconC, picStride, pred, residual, stride, stride);
            cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
1576
1577
/* Evaluate an intra mode with the given partition size: run the luma (and,
 * unless the format is 4:0:0, chroma) intra searches, count the bits needed to
 * signal the CU, fill in the distortion, bit and energy fields of intraMode,
 * then compute its cost via updateModeCost() and checkDQP(). */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    // distortion
    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
    const bool hasChroma = m_csp != X265_CSP_I400;
    if (hasChroma)
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
    intraMode.distortion += hasChroma ? intraMode.lumaDistortion + intraMode.chromaDistortion
                                      : intraMode.lumaDistortion;
    cu.m_distortion[0] = intraMode.distortion;

    // bits: CU header first, then coefficients
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    // skip flag and prediction mode are only signalled outside intra slices
    int skipFlagBits = 0;
    const bool inIntraSlice = m_slice->isIntra();
    if (!inIntraSlice)
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    // luma energy terms, measured over the whole CU
    const Yuv* fencYuv = intraMode.fencYuv;
    const Yuv& recon   = intraMode.reconYuv;
    const Yuv& pred    = intraMode.predYuv;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
    }

    // SSE between source and prediction
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, pred.m_buf[0], pred.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);

#if ENABLE_SCC_EXT
    if (m_param->bEnableSCC)
        intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
#endif
}
1634
1635
/* Note that this function does not save the best intra prediction, it must
1636
 * be generated later. It records the best mode in the cu */
1637
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1638
0
{
1639
0
    ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1640
1641
0
    CUData& cu = intraMode.cu;
1642
0
    uint32_t depth = cuGeom.depth;
1643
1644
0
    cu.setPartSizeSubParts(SIZE_2Nx2N);
1645
0
    cu.setPredModeSubParts(MODE_INTRA);
1646
1647
0
    const uint32_t initTuDepth = 0;
1648
0
    uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;
1649
0
    uint32_t tuSize = 1 << log2TrSize;
1650
0
    const uint32_t absPartIdx = 0;
1651
1652
    // Reference sample smoothing
1653
0
    IntraNeighbors intraNeighbors;
1654
0
    initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1655
0
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1656
1657
0
    const pixel* fenc = intraMode.fencYuv->m_buf[0];
1658
0
    uint32_t stride = intraMode.fencYuv->m_size;
1659
1660
0
    int sad, bsad;
1661
0
    uint32_t bits, bbits, mode, bmode;
1662
0
    uint64_t cost, bcost;
1663
1664
    // 33 Angle modes once
1665
0
    int scaleTuSize = tuSize;
1666
0
    int scaleStride = stride;
1667
0
    int costShift = 0;
1668
0
    int sizeIdx = log2TrSize - 2;
1669
1670
0
    if (tuSize > 32)
1671
0
    {
1672
        // CU is 64x64, we scale to 32x32 and adjust required parameters
1673
0
        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
1674
0
        fenc = m_fencScaled;
1675
1676
0
        pixel nScale[129];
1677
0
        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
1678
0
        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
1679
1680
        // we do not estimate filtering for downscaled samples
1681
0
        memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
1682
0
        memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
1683
1684
0
        scaleTuSize = 32;
1685
0
        scaleStride = 32;
1686
0
        costShift = 2;
1687
0
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1688
0
    }
1689
1690
0
    pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1691
0
    int predsize = scaleTuSize * scaleTuSize;
1692
1693
0
    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1694
1695
    /* there are three cost tiers for intra modes:
1696
     *  pred[0]          - mode probable, least cost
1697
     *  pred[1], pred[2] - less probable, slightly more cost
1698
     *  non-mpm modes    - all cost the same (rbits) */
1699
0
    uint64_t mpms;
1700
0
    uint32_t mpmModes[3];
1701
0
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1702
1703
    // DC
1704
0
    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1705
0
    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1706
0
    bmode = mode = DC_IDX;
1707
0
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1708
0
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1709
1710
    // PLANAR
1711
0
    pixel* planar = intraNeighbourBuf[0];
1712
0
    if (tuSize & (8 | 16 | 32))
1713
0
        planar = intraNeighbourBuf[1];
1714
1715
0
    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
1716
0
    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
1717
0
    mode = PLANAR_IDX;
1718
0
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1719
0
    cost = m_rdCost.calcRdSADCost(sad, bits);
1720
0
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1721
1722
0
    bool allangs = true;
1723
0
    if (primitives.cu[sizeIdx].intra_pred_allangs)
1724
0
    {
1725
0
        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1726
0
        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
1727
0
    }
1728
0
    else
1729
0
        allangs = false;
1730
1731
0
#define TRY_ANGLE(angle) \
1732
0
    if (allangs) { \
1733
0
        if (angle < 18) \
1734
0
            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1735
0
        else \
1736
0
            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
1737
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1738
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1739
0
    } else { \
1740
0
        int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
1741
0
        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
1742
0
        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
1743
0
        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
1744
0
        cost = m_rdCost.calcRdSADCost(sad, bits); \
1745
0
    }
1746
1747
0
    if (m_param->bEnableFastIntra)
1748
0
    {
1749
0
        int asad = 0;
1750
0
        uint32_t lowmode, highmode, amode = 5, abits = 0;
1751
0
        uint64_t acost = MAX_INT64;
1752
1753
        /* pick the best angle, sampling at distance of 5 */
1754
0
        for (mode = 5; mode < 35; mode += 5)
1755
0
        {
1756
0
            TRY_ANGLE(mode);
1757
0
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1758
0
        }
1759
1760
        /* refine best angle at distance 2, then distance 1 */
1761
0
        for (uint32_t dist = 2; dist >= 1; dist--)
1762
0
        {
1763
0
            lowmode = amode - dist;
1764
0
            highmode = amode + dist;
1765
1766
0
            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1767
0
            TRY_ANGLE(lowmode);
1768
0
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1769
1770
0
            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1771
0
            TRY_ANGLE(highmode);
1772
0
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1773
0
        }
1774
1775
0
        if (amode == 33)
1776
0
        {
1777
0
            TRY_ANGLE(34);
1778
0
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1779
0
        }
1780
1781
0
        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1782
0
    }
1783
0
    else // calculate and search all intra prediction angles for lowest cost
1784
0
    {
1785
0
        for (mode = 2; mode < 35; mode++)
1786
0
        {
1787
0
            TRY_ANGLE(mode);
1788
0
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1789
0
        }
1790
0
    }
1791
1792
0
    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
1793
0
    intraMode.initCosts();
1794
0
    intraMode.totalBits = bbits;
1795
0
    intraMode.distortion = bsad;
1796
0
    intraMode.sa8dCost = bcost;
1797
0
    intraMode.sa8dBits = bbits;
1798
0
}
1799
1800
/* Fully encode a 2Nx2N intra CU inside a non-intra slice, using the luma
 * direction already stored in the CU: code the luma quadtree, extract its
 * result into the mode's reconYuv, run the chroma search (unless 4:0:0),
 * count the signalling and coefficient bits, and fill in the distortion, bit
 * and energy fields of intraMode before computing its cost. */
void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);

    CUData& cu = intraMode.cu;
    Yuv& recon = intraMode.reconYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    // luma residual quadtree
    Cost lumaCosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, lumaCosts, tuDepthRange);
    extractIntraResultQT(cu, recon, 0, 0);

    // distortion
    intraMode.lumaDistortion = lumaCosts.distortion;
    if (m_csp == X265_CSP_I400)
    {
        intraMode.distortion = intraMode.lumaDistortion;
    }
    else
    {
        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
    }

    // CU header bits; the skip flag is measured separately from the rest
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    const int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();

    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    // coefficient bits
    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;

    // luma energy terms, measured over the whole CU
    const Yuv* fencYuv = intraMode.fencYuv;
    const uint32_t log2CUSize = cuGeom.log2CUSize;
    if (m_rdCost.m_psyRd)
    {
        intraMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size);
    }
    else if (m_rdCost.m_ssimRd)
    {
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, recon.m_buf[0], recon.m_size, log2CUSize, TEXT_LUMA, 0);
    }

    // SSE between source and prediction
    intraMode.resEnergy = primitives.cu[log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
1854
1855
sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
1856
0
{
1857
0
    CUData& cu = intraMode.cu;
1858
0
    Yuv* reconYuv = &intraMode.reconYuv;
1859
0
    Yuv* predYuv = &intraMode.predYuv;
1860
0
    const Yuv* fencYuv = intraMode.fencYuv;
1861
1862
0
    uint32_t depth        = cuGeom.depth;
1863
0
    uint32_t initTuDepth  = cu.m_partSize[0] != SIZE_2Nx2N;
1864
0
    uint32_t numPU        = 1 << (2 * initTuDepth);
1865
0
    uint32_t log2TrSize   = cuGeom.log2CUSize - initTuDepth;
1866
0
    uint32_t tuSize       = 1 << log2TrSize;
1867
0
    uint32_t qNumParts    = cuGeom.numPartitions >> 2;
1868
0
    uint32_t sizeIdx      = log2TrSize - 2;
1869
0
    uint32_t absPartIdx   = 0;
1870
0
    sse_t totalDistortion = 0;
1871
1872
0
    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
1873
1874
    // loop over partitions
1875
0
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
1876
0
    {
1877
0
        uint32_t bmode = 0;
1878
1879
0
        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
1880
0
            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
1881
0
        else
1882
0
        {
1883
0
            uint64_t candCostList[MAX_RD_INTRA_MODES];
1884
0
            uint32_t rdModeList[MAX_RD_INTRA_MODES];
1885
0
            uint64_t bcost;
1886
0
            int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
1887
1888
0
            {
1889
0
                ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
1890
1891
                // Reference sample smoothing
1892
0
                IntraNeighbors intraNeighbors;
1893
0
                initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
1894
0
                initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
1895
1896
                // determine set of modes to be tested (using prediction signal only)
1897
0
                const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
1898
0
                uint32_t stride = predYuv->m_size;
1899
1900
0
                int scaleTuSize = tuSize;
1901
0
                int scaleStride = stride;
1902
0
                int costShift = 0;
1903
1904
0
                m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1905
1906
                /* there are three cost tiers for intra modes:
1907
                *  pred[0]          - mode probable, least cost
1908
                *  pred[1], pred[2] - less probable, slightly more cost
1909
                *  non-mpm modes    - all cost the same (rbits) */
1910
0
                uint64_t mpms;
1911
0
                uint32_t mpmModes[3];
1912
0
                uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
1913
1914
0
                pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
1915
0
                uint64_t modeCosts[35];
1916
1917
                // DC
1918
0
                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
1919
0
                uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
1920
0
                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1921
0
                modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1922
1923
                // PLANAR
1924
0
                pixel* planar = intraNeighbourBuf[0];
1925
0
                if (tuSize >= 8 && tuSize <= 32)
1926
0
                    planar = intraNeighbourBuf[1];
1927
1928
0
                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
1929
0
                bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
1930
0
                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
1931
0
                modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1932
0
                COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1933
1934
                // angular predictions
1935
0
                if (primitives.cu[sizeIdx].intra_pred_allangs)
1936
0
                {
1937
0
                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
1938
0
                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
1939
0
                    for (int mode = 2; mode < 35; mode++)
1940
0
                    {
1941
0
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1942
0
                        if (mode < 18)
1943
0
                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1944
0
                        else
1945
0
                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1946
0
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1947
0
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1948
0
                    }
1949
0
                }
1950
0
                else
1951
0
                {
1952
0
                    for (int mode = 2; mode < 35; mode++)
1953
0
                    {
1954
0
                        bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
1955
0
                        int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
1956
0
                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
1957
0
                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
1958
0
                        modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1959
0
                        COPY1_IF_LT(bcost, modeCosts[mode]);
1960
0
                    }
1961
0
                }
1962
1963
                /* Find the top maxCandCount candidate modes with cost within 25% of best
1964
                * or among the most probable modes. maxCandCount is derived from the
1965
                * rdLevel and depth. In general we want to try more modes at slower RD
1966
                * levels and at higher depths */
1967
0
                for (int i = 0; i < maxCandCount; i++)
1968
0
                    candCostList[i] = MAX_INT64;
1969
1970
0
                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
1971
0
                for (int mode = 0; mode < 35; mode++)
1972
0
                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
1973
                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
1974
0
                        updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1975
0
            }
1976
1977
            /* measure best candidates using simple RDO (no TU splits) */
1978
0
            bcost = MAX_INT64;
1979
0
            for (int i = 0; i < maxCandCount; i++)
1980
0
            {
1981
0
                if (candCostList[i] == MAX_INT64)
1982
0
                    break;
1983
1984
0
                ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1985
1986
0
                m_entropyCoder.load(m_rqt[depth].cur);
1987
0
                cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
1988
1989
0
                Cost icosts;
1990
0
                if (checkTransformSkip)
1991
0
                    codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1992
0
                else
1993
0
                    codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
1994
0
                COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1995
0
            }
1996
0
        }
1997
1998
0
        ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
1999
2000
        /* remeasure best mode, allowing TU splits */
2001
0
        cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
2002
0
        m_entropyCoder.load(m_rqt[depth].cur);
2003
2004
0
        Cost icosts;
2005
0
        if (checkTransformSkip)
2006
0
            codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
2007
0
        else
2008
0
            codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
2009
0
        totalDistortion += icosts.distortion;
2010
2011
0
        extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
2012
2013
        // set reconstruction for next intra prediction blocks
2014
0
        if (puIdx != numPU - 1)
2015
0
        {
2016
            /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
2017
             * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
2018
             * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
2019
             * that the contexts should be tracked through each PU */
2020
0
            PicYuv*  reconPic = m_frame->m_reconPic[0];
2021
0
            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
2022
0
            uint32_t dststride = reconPic->m_stride;
2023
0
            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
2024
0
            uint32_t srcstride = reconYuv->m_size;
2025
0
            primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
2026
0
        }
2027
0
    }
2028
2029
0
    if (numPU > 1)
2030
0
    {
2031
0
        uint32_t combCbfY = 0;
2032
0
        for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2033
0
            combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
2034
2035
0
        cu.m_cbf[0][0] |= combCbfY;
2036
0
    }
2037
2038
    // TODO: remove this
2039
0
    m_entropyCoder.load(m_rqt[depth].cur);
2040
2041
0
    return totalDistortion;
2042
0
}
2043
2044
/* Select the chroma intra direction for the CU using prediction-only SA8D cost
 * (no residual coding, no bit cost) summed over the U and V planes, and store
 * the winner on the CU via setChromIntraDirSubParts().
 *
 * intraMode - supplies the CU, the source pixels (fencYuv) and the scratch
 *             prediction buffer (predYuv)
 * cuGeom    - geometry of the CU; its depth is used when storing the result */
void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    const Yuv* fencYuv = intraMode.fencYuv;
    Yuv* predYuv = &intraMode.predYuv;

    uint32_t bestMode  = 0;
    uint64_t bestCost  = MAX_INT64;
    uint32_t modeList[NUM_CHROMA_MODE];

    uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t tuDepth = 0;
    int32_t costShift = 0;

    if (tuSize > 32)
    {
        /* chroma block is larger than 32x32: measure a single 32x32 block one
         * TU depth down and scale its cost by 4 (<< 2) */
        tuDepth = 1;
        costShift = 2;
        log2TrSizeC = 5;
    }

    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
    cu.getAllowedChromaDir(0, modeList);

    // check chroma modes
    for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
    {
        uint32_t chromaPredMode = modeList[mode];
        if (chromaPredMode == DM_CHROMA_IDX)
            chromaPredMode = cu.m_lumaIntraDir[0]; // derived mode: reuse luma direction of partition 0
        if (m_csp == X265_CSP_I422)
            chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

        uint64_t cost = 0;
        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            const pixel* fenc = fencYuv->m_buf[chromaId];
            pixel* pred = predYuv->m_buf[chromaId];
            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
            // get prediction signal; each buffer is addressed with the stride of the Yuv that owns it
            predIntraChromaAng(chromaPredMode, pred, predYuv->m_csize, log2TrSizeC);
            cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, fencYuv->m_csize, pred, predYuv->m_csize) << costShift;
        }

        if (cost < bestCost)
        {
            bestCost = cost;
            bestMode = modeList[mode]; // store the signalled mode, not the remapped direction
        }
    }

    cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
}
2099
2100
/* For every chroma TU section of the CU, code each candidate chroma intra
 * direction, keep the cheapest one (reconstruction, CBFs and transform-skip
 * flags included) and return the summed distortion of the chosen modes. */
sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv& reconYuv = intraMode.reconYuv;

    const uint32_t depth       = cuGeom.depth;
    /* chroma is only split per quadrant for non-2Nx2N CUs in 4:4:4 */
    const uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
    const uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
    const uint32_t partStep    = cuGeom.numPartitions;
    const int      sizeIdx     = partitionFromLog2Size(log2TrSize);
    sse_t distortionSum = 0;

    TURecurse tuWalker(initTuDepth ? QUAD_SPLIT : DONT_SPLIT, partStep, 0);

    do
    {
        const uint32_t partIdxC   = tuWalker.absPartIdxTURelCU;
        const size_t   stateBytes = tuWalker.absPartIdxStep * sizeof(uint8_t);

        uint32_t winnerMode = 0;
        sse_t    winnerDist = 0;
        uint64_t winnerCost = MAX_INT64;

        /* build the list of candidate modes */
        uint32_t candModes[NUM_CHROMA_MODE];
        uint32_t numCand = NUM_CHROMA_MODE;

        const bool dirPreset = intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth;
        if (dirPreset)
        {
            /* a direction was already chosen for this CU: evaluate only that one */
            for (uint32_t k = 0; k < NUM_CHROMA_MODE; k++)
                candModes[k] = intraMode.cu.m_chromaIntraDir[0];
            numCand = 1;
        }
        else
        {
            cu.getAllowedChromaDir(partIdxC, candModes);
        }

        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
        {
            /* source picture has no chroma: only the first candidate is evaluated */
            for (uint32_t k = 1; k < NUM_CHROMA_MODE; k++)
                candModes[k] = candModes[0];
            numCand = 1;
        }

        /* evaluate each candidate */
        for (uint32_t idx = 0; idx < numCand; idx++)
        {
            // restore context models
            m_entropyCoder.load(m_rqt[depth].cur);

            cu.setChromIntraDirSubParts(candModes[idx], partIdxC, depth + initTuDepth);
            Cost codedCost;
            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, partIdxC, codedCost);

            if (m_slice->m_pps->bTransformSkipEnabled)
                m_entropyCoder.load(m_rqt[depth].cur);

            m_entropyCoder.resetBits();

            // chroma prediction mode is signalled once per CU, or once per quadrant for 4:4:4 NxN
            bool signalDir;
            if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
            {
                signalDir = !partIdxC;
            }
            else
            {
                const uint32_t quadParts = cuGeom.numPartitions >> 2;
                signalDir = !(partIdxC & (quadParts - 1));
            }
            if (signalDir)
                m_entropyCoder.codeIntraDirChroma(cu, partIdxC, candModes);

            codeSubdivCbfQTChroma(cu, initTuDepth, partIdxC);
            codeCoeffQTChroma(cu, initTuDepth, partIdxC, TEXT_CHROMA_U);
            codeCoeffQTChroma(cu, initTuDepth, partIdxC, TEXT_CHROMA_V);
            const uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();

            uint64_t rdCost;
            if (m_rdCost.m_psyRd)
                rdCost = m_rdCost.calcPsyRdCost(codedCost.distortion, bits, codedCost.energy);
            else if (m_rdCost.m_ssimRd)
                rdCost = m_rdCost.calcSsimRdCost(codedCost.distortion, bits, codedCost.energy);
            else
                rdCost = m_rdCost.calcRdCost(codedCost.distortion, bits);

            if (rdCost < winnerCost)
            {
                winnerCost = rdCost;
                winnerDist = codedCost.distortion;
                winnerMode = candModes[idx];
                /* snapshot the state of the current winner; later candidates overwrite the CU */
                extractIntraResultChromaQT(cu, reconYuv, partIdxC, initTuDepth);
                memcpy(m_qtTempCbf[1], cu.m_cbf[1] + partIdxC, stateBytes);
                memcpy(m_qtTempCbf[2], cu.m_cbf[2] + partIdxC, stateBytes);
                memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + partIdxC, stateBytes);
                memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + partIdxC, stateBytes);
            }
        }

        if (!tuWalker.isLastSection())
        {
            /* copy this section's reconstruction into the recon picture */
            const uint32_t zorder     = cuGeom.absPartIdx + partIdxC;
            PicYuv*        reconPic   = m_frame->m_reconPic[0];
            const uint32_t picStrideC = reconPic->m_strideC;

            pixel*       dstCb = reconPic->getCbAddr(cu.m_cuAddr, zorder);
            const pixel* srcCb = reconYuv.getCbAddr(partIdxC);
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(dstCb, picStrideC, srcCb, reconYuv.m_csize);

            pixel*       dstCr = reconPic->getCrAddr(cu.m_cuAddr, zorder);
            const pixel* srcCr = reconYuv.getCrAddr(partIdxC);
            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(dstCr, picStrideC, srcCr, reconYuv.m_csize);
        }

        /* put the winner's saved state back on the CU */
        memcpy(cu.m_cbf[1] + partIdxC, m_qtTempCbf[1], stateBytes);
        memcpy(cu.m_cbf[2] + partIdxC, m_qtTempCbf[2], stateBytes);
        memcpy(cu.m_transformSkip[1] + partIdxC, m_qtTempTransformSkipFlag[1], stateBytes);
        memcpy(cu.m_transformSkip[2] + partIdxC, m_qtTempTransformSkipFlag[2], stateBytes);
        cu.setChromIntraDirSubParts(winnerMode, partIdxC, depth + initTuDepth);
        distortionSum += winnerDist;
    }
    while (tuWalker.isNextSection());

    if (initTuDepth != 0)
    {
        /* OR the CBFs of the four quadrants into the first partition's CBF entry */
        const uint32_t quadParts = tuWalker.absPartIdxStep;
        uint32_t anyCbfU = 0;
        uint32_t anyCbfV = 0;
        for (uint32_t quad = 0; quad < 4; ++quad)
        {
            const uint32_t quadPartIdx = quad * quadParts;
            anyCbfU |= cu.getCbf(quadPartIdx, TEXT_CHROMA_U, 1);
            anyCbfV |= cu.getCbf(quadPartIdx, TEXT_CHROMA_V, 1);
        }

        cu.m_cbf[1][0] |= anyCbfU;
        cu.m_cbf[2][0] |= anyCbfV;
    }

    /* TODO: remove this */
    m_entropyCoder.load(m_rqt[depth].cur);
    return distortionSum;
}
2235
2236
/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
2237
uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
2238
0
{
2239
0
    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
2240
2241
0
    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
2242
0
    uint8_t  candDir[MRG_MAX_NUM_CANDS];
2243
0
    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
2244
#if ENABLE_SCC_EXT
2245
    restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand);
2246
#else
2247
0
    if (cu.isBipredRestriction())
2248
0
    {
2249
        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
2250
0
        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2251
0
        {
2252
0
            if (candDir[mergeCand] == 3)
2253
0
            {
2254
0
                candDir[mergeCand] = 1;
2255
0
                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
2256
0
            }
2257
0
        }
2258
0
    }
2259
0
#endif
2260
2261
0
    Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2262
2263
0
    uint32_t outCost = MAX_UINT;
2264
0
    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
2265
0
    {
2266
        /* Prevent TMVP candidates from using unavailable reference pixels */
2267
0
        if (m_bFrameParallel)
2268
0
        {
2269
            // Parallel slices bound check
2270
0
            if (m_param->maxSlices > 1)
2271
0
            {
2272
0
                if (cu.m_bFirstRowInSlice &
2273
0
                    ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4))))
2274
0
                    continue;
2275
2276
                // Last row in slice can't reference beyond bound since it is another slice area
2277
                // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
2278
0
                if (cu.m_bLastRowInSlice &&
2279
0
                    ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4)))
2280
0
                    continue;
2281
0
            }
2282
2283
0
            if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
2284
0
                candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)
2285
0
                continue;
2286
0
        }
2287
2288
#if ENABLE_SCC_EXT
2289
        if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc))
2290
        {
2291
            continue;
2292
        }
2293
#endif
2294
0
        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
2295
0
        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
2296
0
        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
2297
0
        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
2298
2299
0
        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
2300
2301
0
        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
2302
0
        if (m_me.bChromaSATD)
2303
0
            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
2304
2305
0
        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
2306
0
        costCand = costCand + m_rdCost.getCost(bitsCand);
2307
0
        if (costCand < outCost)
2308
0
        {
2309
0
            outCost = costCand;
2310
0
            m.bits = bitsCand;
2311
0
            m.index = mergeCand;
2312
0
        }
2313
0
    }
2314
2315
0
    m.mvField[0] = candMvField[m.index][0];
2316
0
    m.mvField[1] = candMvField[m.index][1];
2317
0
    m.dir = candDir[m.index];
2318
2319
0
    return outCost;
2320
0
}
2321
2322
/* find the lowres motion vector from lookahead in middle of current PU */
2323
MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
2324
0
{
2325
0
    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
2326
0
    if (diffPoc > m_param->bframes + 1)
2327
        /* poc difference is out of range for lookahead */
2328
0
        return 0;
2329
2330
0
    MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc];
2331
0
    if (mvs[0].x == 0x7FFF)
2332
        /* this motion search was not estimated by lookahead */
2333
0
        return 0;
2334
2335
0
    uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4;
2336
0
    uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4;
2337
0
    uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x;
2338
2339
0
    X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n");
2340
0
    X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n");
2341
2342
0
    return mvs[idx] << 1; /* scale up lowres mv */
2343
0
}
2344
2345
/* Pick between the two AMVP candidates which is the best one to use as
2346
 * MVP for the motion search, based on SAD cost */
2347
int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref)
2348
0
{
2349
0
    if (amvp[0] == amvp[1])
2350
0
        return 0;
2351
2352
0
    Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv;
2353
0
    uint32_t costs[AMVP_NUM_CANDS];
2354
2355
0
    for (int i = 0; i < AMVP_NUM_CANDS; i++)
2356
0
    {
2357
0
        MV mvCand = amvp[i];
2358
2359
        // NOTE: skip mvCand if Y is > merange and -FN>1
2360
0
        if (m_bFrameParallel)
2361
0
        {
2362
0
            costs[i] = m_me.COST_MAX;
2363
2364
0
            if (mvCand.y >= (m_param->searchRange + 1) * 4)
2365
0
                continue;
2366
2367
0
            if ((m_param->maxSlices > 1) &
2368
0
                ((mvCand.y < m_sliceMinY)
2369
0
              |  (mvCand.y > m_sliceMaxY)))
2370
0
                continue;
2371
0
        }
2372
0
        cu.clipMv(mvCand);
2373
#if ENABLE_SCC_EXT
2374
        if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1)
2375
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand);
2376
        else
2377
#endif
2378
0
            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
2379
0
        costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
2380
0
    }
2381
2382
0
    return (costs[0] <= costs[1]) ? 0 : 1;
2383
0
}
2384
2385
void Search::PME::processTasks(int workerThreadId)
2386
0
{
2387
#if DETAILED_CU_STATS
2388
    int fe = mode.cu.m_encData->m_frameEncoderID;
2389
    master.m_stats[fe].countPMETasks++;
2390
    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
2391
#endif
2392
0
    ProfileScopeEvent(pme);
2393
0
    master.processPME(*this, master.m_tld[workerThreadId].analysis);
2394
0
}
2395
2396
/* Pull motion-estimation jobs from 'pme' under its lock and run each on 'slave'
 * until none are left. Job ids below refCnt[0] address list 0 references, the
 * remaining ids address list 1. */
void Search::processPME(PME& pme, Search& slave)
{
    /* acquire a motion estimation job, else exit early */
    int jobId = -1;
    pme.m_lock.acquire();
    if (pme.m_jobAcquired < pme.m_jobTotal)
        jobId = pme.m_jobAcquired++;
    pme.m_lock.release();

    if (jobId < 0)
        return;

    /* Setup slave Search instance for ME for master's CU */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
        const bool hasChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx,
                               pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, hasChroma);
    }

    /* Perform ME, repeat until no more work is available */
    while (jobId >= 0)
    {
        const int list   = (jobId < pme.m_jobs.refCnt[0]) ? 0 : 1;
        const int refIdx = list ? pme.m_jobs.ref[1][jobId - pme.m_jobs.refCnt[0]] // L1
                                : pme.m_jobs.ref[0][jobId];                       // L0
        slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, list, refIdx);

        /* try to claim the next job */
        jobId = -1;
        pme.m_lock.acquire();
        if (pme.m_jobAcquired < pme.m_jobTotal)
            jobId = pme.m_jobAcquired++;
        pme.m_lock.release();
    }
}
2445
2446
/* Run motion estimation for one (list, ref) pair of a PU and, under the
 * master's ME lock, record the result in interMode.bestME[part][list] if it
 * beats the stored cost (ties go to the smaller ref index).
 *
 * master    - Search instance owning m_listSelBits and the m_meLock that guards bestME
 * interMode - mode whose CU, AMVP candidates and bestME table are used
 * pu        - prediction unit being searched
 * part      - PU index within the CU (selects the bestME row)
 * list, ref - reference list and reference index to search */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    int numIdx = m_slice->m_numRefIdx[list];
#if ENABLE_SCC_EXT
    if (!list && m_ibcEnabled)
        numIdx--;
#endif
    bits += getTUBits(ref, numIdx);

    MotionData* bestME = interMode.bestME[part];

    // 12 mv candidates including lowresMV
    MV  mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx);
#else
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
#endif

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
    bool bLowresMVP = false;
    /* mvp_lowres is only assigned below when analysis save/load are both unused and
     * HME is enabled, yet the HME test further down reads it whenever HME is enabled.
     * Start it at zero so that read is always defined and notZero() skips the HME pass. */
    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres(0, 0);

    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging if lowresMV is not available */
    {
        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
        /* the lowres MV is only added as a candidate for layer 0 */
        if (lmv.notZero() && !layer)
            mvc[numMvc++] = lmv;
        if (m_param->bEnableHME)
            mvp_lowres = lmv;
    }

    m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc;
    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
    {
        /* second search centred on the lowres MV; keep it only if it is cheaper */
        MV outmv_lowres;
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
        if (lowresMvCost < satdCost)
        {
            outmv = outmv_lowres;
            satdCost = lowresMvCost;
            bLowresMVP = true;
        }
    }
    /* Get total cost of partition, but only include MV bit cost once */
    bits += m_me.bitcost(outmv);
    uint32_t mvCost = m_me.mvcost(outmv);
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);

    /* Update LowresMVP to best AMVP cand*/
    if (bLowresMVP)
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);

    /* Refine MVP selection, updates: mvpIdx, bits, cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    /* tie goes to the smallest ref ID, just like --no-pme */
    ScopedLock _lock(master.m_meLock);
    if (cost < bestME[list].cost ||
       (cost == bestME[list].cost && ref < bestME[list].ref))
    {
        bestME[list].mv = outmv;
        bestME[list].mvp = mvp;
        bestME[list].mvpIdx = mvpIdx;
        bestME[list].ref = ref;
        bestME[list].cost = cost;
        bestME[list].bits = bits;
        bestME[list].mvCost  = mvCost;
    }
}
2526
/* Run a motion search around each distinct predictor among the first
 * m_param->mvRefine entries of 'mvp' and return the MV of the cheapest search
 * in 'outmv'. 'outmv' is left untouched if no search improves on INT_MAX. */
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
{
    CUData& cu = interMode.cu;
    int lowestCost = INT_MAX;

    for (int cand = 0; cand < m_param->mvRefine; cand++)
    {
        /* skip a predictor that repeats one of the earlier ones */
        const bool repeatsPrev  = cand && mvp[cand] == mvp[cand - 1];
        const bool repeatsOther = cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1]);
        if (repeatsPrev || repeatsOther)
            continue;

        MV center = mvp[cand];
        cu.clipMv(center);
        m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;

        MV rangeMin, rangeMax;
        setSearchRange(cu, center, m_param->searchRange, rangeMin, rangeMax);

        MV foundMV;
        const int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], rangeMin, rangeMax, center, numMvc, mvc, m_param->searchRange, foundMV, m_param->maxSlices, m_vertRestriction,
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
        if (cost < lowestCost)
        {
            lowestCost = cost;
            outmv = foundMV;
        }
    }
}
2552
/* find the best inter prediction for each PU of specified mode */
2553
#if ENABLE_SCC_EXT
2554
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2], MV* iMVCandList)
2555
#else
2556
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
2557
#endif
2558
0
{
2559
0
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
2560
2561
0
    CUData& cu = interMode.cu;
2562
0
    Yuv* predYuv = &interMode.predYuv;
2563
2564
    // 12 mv candidates including lowresMV
2565
0
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
2566
2567
0
    const Slice *slice = m_slice;
2568
0
    int numPart     = cu.getNumPartInter(0);
2569
0
    int numPredDir  = slice->isInterP() ? 1 : 2;
2570
0
    const int* numRefIdx = slice->m_numRefIdx;
2571
0
    uint32_t lastMode = 0;
2572
0
    int      totalmebits = 0;
2573
0
    MV       mvzero(0, 0);
2574
0
    Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
2575
0
    MergeData merge;
2576
0
    memset(&merge, 0, sizeof(merge));
2577
0
    bool useAsMVP = false;
2578
0
    for (int puIdx = 0; puIdx < numPart; puIdx++)
2579
0
    {
2580
0
        MotionData* bestME = interMode.bestME[puIdx];
2581
0
        PredictionUnit pu(cu, cuGeom, puIdx);
2582
0
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
2583
0
        useAsMVP = false;
2584
0
        x265_analysis_inter_data* interDataCTU = NULL;
2585
0
        int cuIdx;
2586
0
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
2587
0
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
2588
0
        {
2589
0
            interDataCTU = m_frame->m_analysisData.interData;
2590
0
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
2591
0
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
2592
0
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
2593
0
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
2594
0
                useAsMVP = true;
2595
0
        }
2596
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
2597
0
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
2598
0
        bestME[0].cost = MAX_UINT;
2599
0
        bestME[1].cost = MAX_UINT;
2600
2601
0
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
2602
0
        bool bDoUnidir = true;
2603
0
        bool useThreadedME = false;
2604
0
        bool threadedBidir = false;
2605
0
        bool threadedUniL0 = false;
2606
0
        bool threadedUniL1 = false;
2607
0
        MEData threadedMEData;
2608
2609
0
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
2610
0
        if (m_param->bThreadedME)
2611
0
        {
2612
0
            int cuSize = 1 << cu.m_log2CUSize[0];
2613
0
            int lookupWidth = pu.width;
2614
0
            int lookupHeight = pu.height;
2615
0
            bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU;
2616
2617
0
            if (isAmp)
2618
0
            {
2619
0
                if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == SIZE_2NxnD)
2620
0
                    lookupHeight = puIdx ? (pu.width - pu.height) : pu.height;
2621
0
                else
2622
0
                    lookupWidth = puIdx ? (pu.height - pu.width) : pu.width;
2623
0
            }
2624
2625
0
            if (lookupWidth + lookupHeight <= 2 * MAX_CU_SIZE)
2626
0
            {
2627
0
                int startIdx = g_puStartIdx[lookupWidth + lookupHeight][static_cast<int>(cu.m_partSize[0])];
2628
0
                int alignWidth = isAmp ? cuSize : pu.width;
2629
0
                int alignHeight = isAmp ? cuSize : pu.height;
2630
0
                int numPUX = m_param->maxCUSize / alignWidth;
2631
0
                int numPUY = m_param->maxCUSize / alignHeight;
2632
0
                int puOffset = isAmp ? (puIdx * numPUX * numPUY) : (cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx);
2633
0
                int relX = (cu.m_cuPelX / alignWidth) % numPUX;
2634
0
                int relY = (cu.m_cuPelY / alignHeight) % numPUY;
2635
0
                int index = startIdx + (relY * numPUX + relX) + puOffset;
2636
2637
0
                if (index >= 0 && index < MAX_NUM_PUS_PER_CTU)
2638
0
                {
2639
0
                    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
2640
0
                    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
2641
0
                    int slotIdx = row * m_slice->m_sps->numCuInWidth + col;
2642
2643
0
                    threadedMEData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index];
2644
2645
0
                    bool validL0 = threadedMEData.ref[0] >= 0 && threadedMEData.ref[0] < numRefIdx[0];
2646
0
                    bool validL1 = numPredDir > 1 && threadedMEData.ref[1] >= 0 && threadedMEData.ref[1] < numRefIdx[1];
2647
2648
0
                    threadedBidir = validL0 && validL1;
2649
0
                    threadedUniL0 = validL0 && threadedMEData.ref[1] == REF_NOT_VALID;
2650
0
                    threadedUniL1 = validL1 && threadedMEData.ref[0] == REF_NOT_VALID;
2651
0
                    useThreadedME = threadedBidir || threadedUniL0 || threadedUniL1;
2652
0
                }
2653
0
            }
2654
0
        }
2655
2656
        /* Uni-directional prediction */
2657
0
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
2658
0
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
2659
0
        {
2660
0
            for (int list = 0; list < numPredDir; list++)
2661
0
            {
2662
2663
0
                int ref = -1;
2664
0
                if (useAsMVP)
2665
0
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
2666
0
                else
2667
0
                    ref = bestME[list].ref;
2668
0
                if (ref < 0)
2669
0
                {
2670
0
                    continue;
2671
0
                }
2672
0
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2673
0
                int numIdx = m_slice->m_numRefIdx[list];
2674
#if ENABLE_SCC_EXT
2675
                if (!list && m_ibcEnabled)
2676
                    numIdx--;
2677
#endif
2678
0
                bits += getTUBits(ref, numIdx);
2679
2680
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2681
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2682
#else
2683
0
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2684
0
#endif
2685
0
                const MV* amvp = interMode.amvpCand[list][ref];
2686
0
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2687
0
                MV mvmin, mvmax, outmv, mvp;
2688
0
                if (useAsMVP)
2689
0
                {
2690
0
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
2691
0
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
2692
0
                }
2693
0
                else
2694
0
                    mvp = amvp[mvpIdx];
2695
0
                if (m_param->searchMethod == X265_SEA)
2696
0
                {
2697
0
                    int puX = puIdx & 1;
2698
0
                    int puY = puIdx >> 1;
2699
0
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2700
0
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2701
0
                }
2702
0
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2703
0
                MV mvpIn = mvp;
2704
0
                int satdCost;
2705
0
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
2706
0
                    mvpIn = bestME[list].mv;
2707
0
                if (useAsMVP && m_param->mvRefine > 1)
2708
0
                {
2709
0
                    MV bestmv, mvpSel[3];
2710
0
                    int mvpIdxSel[3];
2711
0
                    satdCost = m_me.COST_MAX;
2712
0
                    mvpSel[0] = mvp;
2713
0
                    mvpIdxSel[0] = mvpIdx;
2714
0
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2715
0
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
2716
0
                    mvpIdxSel[1] = mvpIdx;
2717
0
                    if (m_param->mvRefine > 2)
2718
0
                    {
2719
0
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
2720
0
                        mvpIdxSel[2] = !mvpIdx;
2721
0
                    }
2722
0
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
2723
0
                    {
2724
0
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
2725
0
                            continue;
2726
0
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
2727
0
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices, m_vertRestriction,
2728
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2729
0
                        if (satdCost > bcost)
2730
0
                        {
2731
0
                            satdCost = bcost;
2732
0
                            outmv = bestmv;
2733
0
                            mvp = mvpSel[cand];
2734
0
                            mvpIdx = mvpIdxSel[cand];
2735
0
                        }
2736
0
                    }
2737
0
                    mvpIn = mvp;
2738
0
                }
2739
0
                else
2740
0
                {
2741
0
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2742
0
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2743
0
                }
2744
2745
                /* Get total cost of partition, but only include MV bit cost once */
2746
0
                bits += m_me.bitcost(outmv);
2747
0
                uint32_t mvCost = m_me.mvcost(outmv);
2748
0
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2749
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
2750
0
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
2751
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2752
0
                else
2753
0
                {
2754
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here 
2755
                      the actual mvp is bestME from pass 1 for that mvpIdx */
2756
0
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
2757
0
                    if (diffBits < 0)
2758
0
                    {
2759
0
                        mvpIdx = !mvpIdx;
2760
0
                        uint32_t origOutBits = bits;
2761
0
                        bits = origOutBits + diffBits;
2762
0
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
2763
0
                    }
2764
0
                    mvp = amvp[mvpIdx];
2765
0
                }
2766
2767
0
                if (cost < bestME[list].cost)
2768
0
                {
2769
0
                    bestME[list].mv = outmv;
2770
0
                    bestME[list].mvp = mvp;
2771
0
                    bestME[list].mvpIdx = mvpIdx;
2772
0
                    bestME[list].cost = cost;
2773
0
                    bestME[list].bits = bits;
2774
0
                    bestME[list].mvCost  = mvCost;
2775
0
                    bestME[list].ref = ref;
2776
0
                }
2777
0
                bDoUnidir = false;
2778
0
            }            
2779
0
        }
2780
0
        else if (m_param->bDistributeMotionEstimation)
2781
0
        {
2782
0
            PME pme(*this, interMode, cuGeom, pu, puIdx);
2783
0
            pme.m_jobTotal = 0;
2784
0
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */
2785
2786
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2787
0
            for (int list = 0; list < numPredDir; list++)
2788
0
            {
2789
0
                int idx = 0;
2790
0
                int numIdx = numRefIdx[list];
2791
#if ENABLE_SCC_EXT
2792
                if (!list && m_ibcEnabled)
2793
                    numIdx--;
2794
#endif
2795
0
                for (int ref = 0; ref < numIdx; ref++)
2796
0
                {
2797
0
                    if (!(refMask & (1 << ref)))
2798
0
                        continue;
2799
2800
0
                    pme.m_jobs.ref[list][idx++]  = ref;
2801
0
                    pme.m_jobTotal++;
2802
0
                }
2803
0
                pme.m_jobs.refCnt[list] = idx;
2804
2805
                /* the second list ref bits start at bit 16 */
2806
0
                refMask >>= 16;
2807
0
            }
2808
2809
0
            if (pme.m_jobTotal > 2)
2810
0
            {
2811
0
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
2812
2813
0
                processPME(pme, *this);
2814
2815
0
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
2816
0
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */
2817
2818
0
                bDoUnidir = false;
2819
2820
0
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
2821
0
                pme.waitForExit();
2822
0
            }
2823
2824
            /* if no peer threads were bonded, fall back to doing unidirectional
2825
             * searches ourselves without overhead of singleMotionEstimation() */
2826
0
        }
2827
0
        if (bDoUnidir && (!m_param->bThreadedME || !useThreadedME))
2828
0
        {
2829
0
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
2830
0
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
2831
2832
0
            for (int list = 0; list < numPredDir; list++)
2833
0
            {
2834
0
                int numIdx = numRefIdx[list];
2835
#if ENABLE_SCC_EXT
2836
                if (!list && m_ibcEnabled)
2837
                    numIdx--;
2838
#endif
2839
0
                for (int ref = 0; ref < numIdx; ref++)
2840
0
                {
2841
0
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
2842
2843
0
                    if (!(refMask & (1 << ref)))
2844
0
                    {
2845
0
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
2846
0
                        continue;
2847
0
                    }
2848
2849
0
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
2850
0
                    bits += getTUBits(ref, numIdx);
2851
2852
#if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT)
2853
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx);
2854
#else
2855
0
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
2856
0
#endif
2857
2858
0
                    const MV* amvp = interMode.amvpCand[list][ref];
2859
0
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
2860
0
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
2861
0
                    bool bLowresMVP = false;
2862
2863
0
                    if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging when lowresMV is not available */
2864
0
                    {
2865
0
                        MV lmv = getLowresMV(cu, pu, list, ref);
2866
0
                        int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0;
2867
0
                        if (lmv.notZero() && !layer)
2868
0
                            mvc[numMvc++] = lmv;
2869
0
                        if (m_param->bEnableHME)
2870
0
                            mvp_lowres = lmv;
2871
0
                    }
2872
0
                    if (m_param->searchMethod == X265_SEA)
2873
0
                    {
2874
0
                        int puX = puIdx & 1;
2875
0
                        int puY = puIdx >> 1;
2876
0
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
2877
0
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride;
2878
0
                    }
2879
0
                    m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc;
2880
0
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
2881
0
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction,
2882
0
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2883
2884
0
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
2885
0
                    {
2886
0
                        MV outmv_lowres;
2887
0
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
2888
0
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction,
2889
0
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
2890
0
                        if (lowresMvCost < satdCost)
2891
0
                        {
2892
0
                            outmv = outmv_lowres;
2893
0
                            satdCost = lowresMvCost;
2894
0
                            bLowresMVP = true;
2895
0
                        }
2896
0
                    }
2897
2898
                    /* Get total cost of partition, but only include MV bit cost once */
2899
0
                    bits += m_me.bitcost(outmv);
2900
0
                    uint32_t mvCost = m_me.mvcost(outmv);
2901
0
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
2902
                    /* Update LowresMVP to best AMVP cand*/
2903
0
                    if (bLowresMVP)
2904
0
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
2905
2906
                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
2907
0
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
2908
2909
#if ENABLE_SCC_EXT
2910
                    if (m_param->bEnableSCC && (list <= 1 && ref <= 1 && (cu.m_partSize[0] == SIZE_2NxN || cu.m_partSize[0] == SIZE_Nx2N) && (1 << cu.m_log2CUSize[0]) <= 16))
2911
                    {
2912
                        iMVCandList[4 * list + 2 * ref + puIdx] = outmv;
2913
                    }
2914
#endif
2915
2916
0
                    if (cost < bestME[list].cost)
2917
0
                    {
2918
0
                        bestME[list].mv      = outmv;
2919
0
                        bestME[list].mvp     = mvp;
2920
0
                        bestME[list].mvpIdx  = mvpIdx;
2921
0
                        bestME[list].ref     = ref;
2922
0
                        bestME[list].cost    = cost;
2923
0
                        bestME[list].bits    = bits;
2924
0
                        bestME[list].mvCost  = mvCost;
2925
0
                    }
2926
0
                }
2927
                /* the second list ref bits start at bit 16 */
2928
0
                refMask >>= 16;
2929
0
            }
2930
0
        }
2931
2932
        /* Bi-directional prediction */
2933
0
        MotionData bidir[2];
2934
0
        uint32_t bidirCost = MAX_UINT;
2935
0
        int bidirBits = 0;
2936
2937
0
        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
2938
0
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
2939
0
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && (!m_param->bThreadedME || !useThreadedME))
2940
0
        {
2941
0
            bidir[0] = bestME[0];
2942
0
            bidir[1] = bestME[1];
2943
2944
0
            int satdCost;
2945
2946
0
            if (m_me.bChromaSATD)
2947
0
            {
2948
0
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
2949
0
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2950
0
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
2951
0
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2952
0
                motionCompensation(cu, pu, tmpPredYuv, true, true);
2953
2954
0
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
2955
0
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
2956
0
            }
2957
0
            else
2958
0
            {
2959
0
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
2960
0
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
2961
0
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2962
2963
                /* Generate reference subpels */
2964
0
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
2965
0
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
2966
0
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
2967
0
                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
2968
0
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2969
0
            }
2970
2971
0
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2972
0
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2973
2974
0
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2975
0
            if (bTryZero)
2976
0
            {
2977
                /* Do not try zero MV if unidir motion predictors are beyond
2978
                 * valid search area */
2979
0
                MV mvmin, mvmax;
2980
0
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2981
0
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2982
0
                mvmax.y += 2; // there is some pad for subpel refine
2983
0
                mvmin <<= 2;
2984
0
                mvmax <<= 2;
2985
2986
0
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2987
0
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2988
0
            }
2989
0
            if (bTryZero)
2990
0
            {
2991
                /* coincident blocks of the two reference pictures */
2992
0
                if (m_me.bChromaSATD)
2993
0
                {
2994
0
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
2995
0
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
2996
0
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
2997
0
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
2998
0
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
2999
3000
0
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
3001
0
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
3002
0
                }
3003
0
                else
3004
0
                {
3005
0
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
3006
0
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
3007
0
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
3008
0
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
3009
0
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
3010
0
                }
3011
0
                MV mvp0 = bestME[0].mvp;
3012
0
                int mvpIdx0 = bestME[0].mvpIdx;
3013
0
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
3014
3015
0
                MV mvp1 = bestME[1].mvp;
3016
0
                int mvpIdx1 = bestME[1].mvpIdx;
3017
0
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
3018
3019
0
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
3020
3021
                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
3022
0
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
3023
0
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
3024
3025
0
                if (cost < bidirCost)
3026
0
                {
3027
0
                    bidir[0].mv = mvzero;
3028
0
                    bidir[1].mv = mvzero;
3029
0
                    bidir[0].mvp = mvp0;
3030
0
                    bidir[1].mvp = mvp1;
3031
0
                    bidir[0].mvpIdx = mvpIdx0;
3032
0
                    bidir[1].mvpIdx = mvpIdx1;
3033
0
                    bidirCost = cost;
3034
0
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
3035
0
                }
3036
0
            }
3037
0
        }
3038
3039
0
        uint32_t bestCost = MAX_INT;
3040
0
        bool isMerge = false;
3041
0
        bool isBidir = false;
3042
0
        bool uniL0 = false;
3043
0
        bool uniL1 = false;
3044
3045
0
        if (useThreadedME)
3046
0
        {
3047
0
            bestME[0].ref = threadedMEData.ref[0];
3048
0
            bestME[1].ref = threadedMEData.ref[1];
3049
3050
0
            isBidir = threadedBidir;
3051
0
            uniL0 = threadedUniL0;
3052
0
            uniL1 = threadedUniL1;
3053
3054
0
            if (isBidir)
3055
0
            {
3056
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3057
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3058
3059
0
                bidir[0].mv = threadedMEData.mv[0];
3060
0
                bidir[1].mv = threadedMEData.mv[1];
3061
0
                bidir[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3062
0
                bidir[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3063
0
                bidir[0].mvCost = threadedMEData.mvCost[0];
3064
0
                bidir[1].mvCost = threadedMEData.mvCost[1];
3065
0
                bidirCost = threadedMEData.cost;
3066
0
                bidirBits = threadedMEData.bits;
3067
3068
0
                bestCost = bidirCost;
3069
0
            }
3070
0
            else if (uniL0)
3071
0
            {
3072
0
                cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc);
3073
3074
0
                bestME[0].mv = threadedMEData.mv[0];
3075
0
                bestME[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
3076
0
                bestME[0].mvCost = threadedMEData.mvCost[0];
3077
0
                bestME[0].cost = threadedMEData.cost;
3078
0
                bestME[0].bits = threadedMEData.bits;
3079
3080
0
                bestCost = bestME[0].cost;
3081
0
            }
3082
0
            else if (uniL1)
3083
0
            {
3084
0
                cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc);
3085
3086
0
                bestME[1].mv = threadedMEData.mv[1];
3087
0
                bestME[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
3088
0
                bestME[1].mvCost = threadedMEData.mvCost[1];
3089
0
                bestME[1].cost = threadedMEData.cost;
3090
0
                bestME[1].bits = threadedMEData.bits;
3091
3092
0
                bestCost = bestME[1].cost;
3093
0
            }
3094
3095
0
            if (mrgCost < bestCost)
3096
0
                isMerge = true;
3097
0
        }
3098
3099
        /* select best option and store into CU */
3100
0
        if ((mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) || isMerge)
3101
0
        {
3102
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
3103
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
3104
0
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
3105
0
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
3106
0
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
3107
0
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
3108
0
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
3109
3110
0
            totalmebits += merge.bits;
3111
0
        }
3112
0
        else if ((bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) || isBidir)
3113
0
        {
3114
0
            lastMode = 2;
3115
3116
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3117
0
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
3118
0
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
3119
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3120
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
3121
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
3122
3123
0
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
3124
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3125
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
3126
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
3127
3128
0
            totalmebits += bidirBits;
3129
0
        }
3130
0
        else if ((bestME[0].cost <= bestME[1].cost) || uniL0)
3131
0
        {
3132
0
            lastMode = 0;
3133
3134
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3135
0
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3136
0
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
3137
0
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
3138
0
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
3139
0
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
3140
3141
0
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3142
0
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
3143
3144
0
            totalmebits += bestME[0].bits;
3145
0
        }
3146
0
        else
3147
0
        {
3148
0
            lastMode = 1;
3149
3150
0
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
3151
0
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
3152
0
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
3153
0
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
3154
0
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
3155
0
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
3156
3157
0
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
3158
0
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
3159
3160
0
            totalmebits += bestME[1].bits;
3161
0
        }
3162
3163
0
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
3164
0
    }
3165
0
    interMode.sa8dBits += totalmebits;
3166
0
}
3167
3168
#if ENABLE_SCC_EXT
3169
/* Sum of absolute differences between the width x height block starting at
 * 'ref' and the block starting at 'curr'.  Each pointer is advanced by its own
 * stride after every row. */
uint32_t Search::getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height)
{
    uint32_t sum = 0;
    const pixel* refRow = ref;
    const pixel* curRow = curr;

    for (int rowsLeft = height; rowsLeft > 0; rowsLeft--, refRow += refStride, curRow += currStride)
    {
        for (int col = 0; col < width; col++)
            sum += abs(refRow[col] - curRow[col]);
    }

    return sum;
}
3184
3185
int Search::intraBCSearchMVChromaRefine(Mode& intraBCMode,
3186
    const CUGeom& cuGeom,
3187
    int         roiWidth,
3188
    int         roiHeight,
3189
    int         cuPelX,
3190
    int         cuPelY,
3191
    uint32_t* sadBestCand,
3192
    MV* MVCand,
3193
    uint32_t    partOffset,
3194
    int         puIdx
3195
)
3196
{
3197
    int bestCandIdx = 0;
3198
    uint32_t  sadBest = UINT_MAX;
3199
    uint32_t  tempSad;
3200
3201
    pixel* ref;
3202
    const pixel* picOrg;
3203
    int refStride, orgStride;
3204
    int width, height;
3205
3206
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3207
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3208
3209
    CUData& cu = intraBCMode.cu;
3210
    Yuv& tmpPredYuv = intraBCMode.predYuv;
3211
    PredictionUnit pu(cu, cuGeom, puIdx);
3212
3213
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3214
    {
3215
        if ((!MVCand[cand].x) && (!MVCand[cand].y))
3216
        {
3217
            continue;
3218
        }
3219
3220
        if (((int)(cuPelY + MVCand[cand].y + roiHeight) >= picHeight) || ((cuPelY + MVCand[cand].y) < 0))
3221
        {
3222
            continue;
3223
        }
3224
3225
        if (((int)(cuPelX + MVCand[cand].x + roiWidth) >= picWidth) || ((cuPelX + MVCand[cand].x) < 0))
3226
        {
3227
            continue;
3228
        }
3229
3230
        tempSad = sadBestCand[cand];
3231
        int bitDepths = m_param->sourceBitDepth;
3232
        MV mvQuaterPixl = MVCand[cand];
3233
        mvQuaterPixl <<= 2;
3234
        cu.setPUMv(0, mvQuaterPixl, pu.puAbsPartIdx, puIdx);
3235
        cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx);
3236
        cu.setPUMv(1, MV(), pu.puAbsPartIdx, puIdx);
3237
        cu.setPURefIdx(1, -1, pu.puAbsPartIdx, puIdx);
3238
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
3239
3240
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
3241
3242
        for (uint32_t ch = TEXT_CHROMA_U; ch < MAX_NUM_COMPONENT; ch++)
3243
        {
3244
            ref = m_slice->m_refFrameList[0][m_slice->m_numRefIdx[0] - 1]->m_reconPic[1]->getChromaAddr(ch, cu.m_cuAddr, cu.m_absIdxInCTU + partOffset);
3245
3246
            picOrg = intraBCMode.fencYuv->getChromaAddr(ch, partOffset);
3247
            orgStride = intraBCMode.fencYuv->m_csize;
3248
3249
            refStride = m_frame->m_reconPic[1]->m_strideC;
3250
3251
            width = roiWidth >> m_hChromaShift;
3252
            height = roiHeight >> m_vChromaShift;
3253
3254
            ref = tmpPredYuv.getChromaAddr(ch, partOffset);
3255
            refStride = tmpPredYuv.m_csize;
3256
3257
            for (int row = 0; row < height; row++)
3258
            {
3259
                for (int col = 0; col < width; col++)
3260
                {
3261
                    tempSad += ((abs(ref[col] - picOrg[col])) >> (bitDepths - 8));
3262
                }
3263
                ref += refStride;
3264
                picOrg += orgStride;
3265
            }
3266
        }
3267
3268
        if (tempSad < sadBest)
3269
        {
3270
            sadBest = tempSad;
3271
            bestCandIdx = cand;
3272
        }
3273
    }
3274
3275
    return bestCandIdx;
3276
}
3277
3278
void Search::updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc)
3279
{
3280
    if (roiWidth + roiHeight > 8)
3281
    {
3282
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvCand, CHROMA_REFINEMENT_CANDIDATES, false);
3283
3284
        if (roiWidth + roiHeight == 32)
3285
        {
3286
            ibc.m_numBV16s = ibc.m_numBVs;
3287
        }
3288
    }
3289
}
3290
3291
void Search::intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* MVCand)
3292
{
3293
    int j = CHROMA_REFINEMENT_CANDIDATES - 1;
3294
3295
    if (sad < sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3296
    {
3297
        for (int t = CHROMA_REFINEMENT_CANDIDATES - 1; t >= 0; t--)
3298
        {
3299
            if (sad < sadBestCand[t])
3300
            {
3301
                j = t;
3302
            }
3303
        }
3304
3305
        for (int k = CHROMA_REFINEMENT_CANDIDATES - 1; k > j; k--)
3306
        {
3307
            sadBestCand[k] = sadBestCand[k - 1];
3308
3309
            MVCand[k].set(MVCand[k - 1].x, MVCand[k - 1].y);
3310
        }
3311
        sadBestCand[j] = sad;
3312
        MVCand[j].set(x, y);
3313
    }
3314
}
3315
3316
/* Append the vectors of src[0..sn) that are not yet present in dst[0..dn) to
 * dst, stopping once dst holds SCM_S0067_NUM_CANDIDATES entries.  When
 * isSrcQuarPel is false each source vector is shifted left by two before it is
 * compared and stored.  Returns the new number of entries in dst. */
uint32_t Search::mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel)
{
    uint32_t srcIdx = 0;

    while (srcIdx < sn && dn < SCM_S0067_NUM_CANDIDATES)
    {
        MV candMv = src[srcIdx++];
        if (!isSrcQuarPel)
            candMv <<= 2;

        // look for an identical vector among the entries already stored
        uint32_t pos = 0;
        while (pos < dn && !(candMv == dst[pos]))
            pos++;

        if (pos == dn)
            dst[dn++] = candMv;
    }

    return dn;
}
3343
3344
/* Turn bi-predictive merge candidates (interDir == 3) into L0-only candidates
 * where bi-prediction is restricted for this PU:
 *  - if cu->is8x8BipredRestriction() holds for the candidate's two motion
 *    fields, the candidate is converted only when the PU is at most 8x8;
 *  - otherwise it is converted when cu->isBipredRestriction() holds.
 * Conversion sets interDirNeighbours[i] to 1 and the L1 refIdx to REF_NOT_VALID. */
void Search::restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand)
{
    for (uint32_t idx = 0; idx < numValidMergeCand; idx++)
    {
        if (interDirNeighbours[idx] != 3)
            continue;

        MVField* fields = mvFieldNeighbours[idx];
        const bool restricted8x8 = cu->is8x8BipredRestriction(fields[0].mv, fields[1].mv,
                                                              fields[0].refIdx, fields[1].refIdx);

        int puWidth = 0;
        int puHeight = 0;
        uint32_t puAddr;
        cu->getPartIndexAndSize(puIdx, puAddr, puWidth, puHeight);

        bool forceUniPred;
        if (restricted8x8)
            forceUniPred = (puWidth <= 8 && puHeight <= 8);
        else
            forceUniPred = cu->isBipredRestriction();

        if (forceUniPred)
        {
            interDirNeighbours[idx] = 1;
            fields[1].refIdx = REF_NOT_VALID;
        }
    }
}
3379
3380
bool Search::isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* cu,
3381
    int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize)
3382
{
3383
    static const int s_floorLog2[65] =
3384
    {
3385
      -1, 0, 1, 1, 2, 2, 2, 2, 3, 3,
3386
       3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
3387
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
3388
       4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
3389
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3390
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
3391
       5, 5, 5, 5, 6
3392
    };
3393
3394
    int ctuSizeLog2 = s_floorLog2[ctuSize];
3395
    int interpolationSamplesX = (cu->m_chromaFormat == X265_CSP_I422 || cu->m_chromaFormat == X265_CSP_I420) ? ((xBv & 0x1) << 1) : 0;
3396
    int interpolationSamplesY = (cu->m_chromaFormat == X265_CSP_I420) ? ((yBv & 0x1) << 1) : 0;
3397
    int refRightX = xPos + xBv + width - 1 + interpolationSamplesX;
3398
    int refBottomY = yPos + yBv + height - 1 + interpolationSamplesY;
3399
    int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3400
    int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3401
3402
    if ((xPos + xBv - interpolationSamplesX) < 0)
3403
        return false;
3404
    if (refRightX >= picWidth)
3405
        return false;
3406
    if ((yPos + yBv - interpolationSamplesY) < 0)
3407
        return false;
3408
    if (refBottomY >= picHeight)
3409
        return false;
3410
3411
    if ((xBv + width + interpolationSamplesX) > 0 && (yBv + height + interpolationSamplesY) > 0)
3412
        return false;
3413
3414
    if (refBottomY >> ctuSizeLog2 < yPos >> ctuSizeLog2)
3415
    {
3416
        int refCuX = refRightX / ctuSize;
3417
        int refCuY = refBottomY / ctuSize;
3418
        int cuPelX = xPos / ctuSize;
3419
        int cuPelY = yPos / ctuSize;
3420
3421
        if (((int)(refCuX - cuPelX) > (int)((cuPelY - refCuY))))
3422
            return false;
3423
        else
3424
            return true;
3425
    }
3426
3427
    if (refBottomY >> ctuSizeLog2 > yPos >> ctuSizeLog2)
3428
    {
3429
        return false;
3430
    }
3431
3432
    // in the same CTU line
3433
    if (refRightX >> ctuSizeLog2 < xPos >> ctuSizeLog2)
3434
        return true;
3435
    if (refRightX >> ctuSizeLog2 > xPos >> ctuSizeLog2)
3436
        return false;
3437
3438
    // same CTU
3439
    int mask = 1 << ctuSizeLog2;
3440
    mask -= 1;
3441
    int rasterCurr = ((((yPos & mask) - yStartInCU) >> 2) << (ctuSizeLog2 - 2)) + (((xPos & mask) - xStartInCU) >> 2);
3442
    int rasterRef = (((refBottomY & mask) >> 2) << (ctuSizeLog2 - 2)) + ((refRightX & mask) >> 2);
3443
3444
    if (g_rasterToZscan[rasterRef] >= g_rasterToZscan[rasterCurr])
3445
        return false;
3446
    return true;
3447
}
3448
3449
/* Check the displacement (predX, predY) for the roiWidth x roiHeight partition
 * at z-scan offset partOffset inside 'cu'.  Forwards to isBlockVectorValid with
 * the partition's picture position, its pel offset looked up from
 * g_zscanToPelX / g_zscanToPelY, and m_param->maxCUSize as the CTU size. */
bool Search::isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset)
{
    const int offsetX = g_zscanToPelX[partOffset];
    const int offsetY = g_zscanToPelY[partOffset];
    const int pelX = cu->m_cuPelX + offsetX;
    const int pelY = cu->m_cuPelY + offsetY;

    return isBlockVectorValid(pelX, pelY, roiWidth, roiHeight, cu, offsetX, offsetY, predX, predY, m_param->maxCUSize);
}
3460
3461
// Search a block vector for one intra-block-copy PU.
//
// refY / refStride address the reference samples: the block for displacement
// (x, y) is read at refY + y * refStride + x.  searchRangeLT / searchRangeRB
// give the inclusive displacement range.  The chosen displacement is written to
// 'mv' and its cost (bufSAD plus mvcost, as stored in the candidate list) to
// 'cost'.
//
// The best CHROMA_REFINEMENT_CANDIDATES displacements are kept in the local
// lists MVCand / sadBestCand (maintained by intraBCSearchMVCandUpdate).  Except
// on the early exits that return straight after a 1D search hit or for
// testOnlyPred, the final choice among them is made by
// intraBCSearchMVChromaRefine.  Every exit other than the testOnlyPred one also
// passes the list to updateBVMergeCandLists.
//
// When m_param->bEnableSCC == 1 a staged fast search is used (stored vectors,
// vertical 1D, horizontal 1D, then up to three interleaved 2D passes);
// otherwise every displacement in the range is tested.
void Search::intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB,
3462
    MV& mv, uint32_t& cost, int roiWidth, int roiHeight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
3463
{
3464
    const int   srchRngHorLeft = searchRangeLT->x;
3465
    const int   srchRngHorRight = searchRangeRB->x;
3466
    const int   srchRngVerTop = searchRangeLT->y;
3467
    const int   srchRngVerBottom = searchRangeRB->y;
3468
3469
    CUData& cu = intraBCMode.cu;
3470
    const uint32_t  lcuWidth = m_param->maxCUSize;
3471
    const uint32_t  lcuHeight = m_param->maxCUSize;
3472
    const int       puPelOffsetX = g_zscanToPelX[partAddr];
3473
    const int       puPelOffsetY = g_zscanToPelY[partAddr];
3474
    const int       cuPelX = cu.m_cuPelX + puPelOffsetX;  // Point to the location of PU
3475
    const int       cuPelY = cu.m_cuPelY + puPelOffsetY;
3476
3477
    uint32_t  sad = 0;
3478
    uint32_t  sadBest = UINT_MAX;
3479
    int         bestX = 0;
3480
    int         bestY = 0;
3481
    pixel* refSrch;
3482
3483
    int         bestCandIdx = 0;
3484
    uint32_t    partOffset = 0;
3485
    MV          MVCand[CHROMA_REFINEMENT_CANDIDATES];
3486
    uint32_t    sadBestCand[CHROMA_REFINEMENT_CANDIDATES];
3487
3488
    partOffset = partAddr;
3489
    PredictionUnit pu(cu, cuGeom, puIdx);
3490
    // Start with an empty candidate list: worst possible cost, zero vector.
    for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++)
3491
    {
3492
        sadBestCand[cand] = UINT_MAX;
3493
        MVCand[cand].set(0, 0);
3494
    }
3495
3496
    // PU position relative to the top-left corner of its CTU
    const int         relCUPelX = cuPelX % lcuWidth;
3497
    const int         relCUPelY = cuPelY % lcuHeight;
3498
    const int chromaROIWidthInPixels = roiWidth;
3499
    const int chromaROIHeightInPixels = roiHeight;
3500
    bool fastsearch = (m_param->bEnableSCC == 1) ? true : false;
3501
    // Hard-coded to false, so every branch below that requires it is never taken.
    bool  isFullFrameSearchrangeEnabled = false; // disabled by default
3502
3503
    if (fastsearch)
3504
    {
3505
        // Best cost as recorded during the 1D searches; stays 0 if they never
        // insert a candidate.
        uint32_t tempSadBest = 0;
3506
        int srLeft = srchRngHorLeft, srRight = srchRngHorRight, srTop = srchRngVerTop, srBottom = srchRngVerBottom;
3507
        const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples;
3508
        const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;
3509
3510
        if (isFullFrameSearchrangeEnabled)//full frame search
3511
        {
3512
            srLeft = -1 * cuPelX;
3513
            srTop = -1 * cuPelY;
3514
3515
            srRight = picWidth - cuPelX - roiWidth;
3516
            srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight;
3517
3518
            if (cuPelX + srRight + roiWidth > (int)picWidth)
3519
            {
3520
                srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth;
3521
            }
3522
            if (cuPelY + srBottom + roiHeight > (int)picHeight)
3523
            {
3524
                srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight;
3525
            }
3526
        }
3527
3528
        // Choose how many of the stored block vectors in ibc.m_BVs are reused:
        // none for blocks larger than 8 in either dimension or for
        // testOnlyPred, the first m_numBV16s for 8x8, all of them otherwise.
        if (roiWidth > 8 || roiHeight > 8)
3529
            ibc.m_numBVs = 0;
3530
        else if (roiWidth + roiHeight == 16)
3531
            ibc.m_numBVs = ibc.m_numBV16s;
3532
        if (testOnlyPred)
3533
            ibc.m_numBVs = 0;
3534
3535
        // Add the encoder-only predictors of this PU; they are merged with
        // isSrcQuarPel = true, i.e. without the left shift by two.
        MV  mvPredEncOnly[16];
3536
        int nbPreds = 0;
3537
        cu.getIntraBCMVPsEncOnly(partAddr, mvPredEncOnly, nbPreds, puIdx);
3538
        ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvPredEncOnly, nbPreds, true);
3539
3540
        // Stage 1: evaluate the stored / predicted block vectors.
        for (int cand = 0; cand < ibc.m_numBVs; cand++)
3541
        {
3542
            // stored vectors are shifted right by two to get the sample displacement
            int xPred = ibc.m_BVs[cand].x >> 2;
3543
            int yPred = ibc.m_BVs[cand].y >> 2;
3544
            // skip the zero vector and anything outside the search range
            if (!(xPred == 0 && yPred == 0) && !((yPred < srTop) || (yPred > srBottom)) && !((xPred < srLeft) || (xPred > srRight)))
3545
            {
3546
                // bottom-right sample of the reference block, relative to the CTU origin
                int tempY = yPred + relCUPelY + roiHeight - 1;
3547
                int tempX = xPred + relCUPelX + roiWidth - 1;
3548
                bool validCand = isValidIntraBCSearchArea(&cu, xPred, yPred, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset);
3549
3550
                if ((tempX >= (int)lcuWidth) && (tempY >= 0) && isFullFrameSearchrangeEnabled)
3551
                    validCand = false;
3552
3553
                // Reject a reference block whose bottom-right sample maps to a
                // z-scan index at or after cu.m_absIdxInCTU.
                if ((tempX >= 0) && (tempY >= 0))
3554
                {
3555
                    int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3556
                    uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3557
                    if (tempZscanIdx >= cu.m_absIdxInCTU)
3558
                    {
3559
                        validCand = false;
3560
                    }
3561
                }
3562
3563
                if (validCand)
3564
                {
3565
                    sad = m_me.mvcost(ibc.m_BVs[cand]);
3566
3567
                    refSrch = refY + yPred * refStride + xPred;
3568
3569
                    sad += m_me.bufSAD(refSrch, refStride);
3570
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3571
                    {
3572
                        continue;
3573
                    }
3574
3575
                    intraBCSearchMVCandUpdate(sad, xPred, yPred, sadBestCand, MVCand);
3576
                }
3577
            }
3578
        }
3579
        bestX = MVCand[0].x;
3580
        bestY = MVCand[0].y;
3581
        mv.set(bestX, bestY);
3582
        sadBest = sadBestCand[0];
3583
3584
        // testOnlyPred: report the best stored/predicted vector and stop.  If
        // none was valid this is the zero vector with cost UINT_MAX.
        if (testOnlyPred)
3585
        {
3586
            cost = sadBest;
3587
            return;
3588
        }
3589
3590
        // Stage 2: vertical 1D search (x = 0), from the first row that clears
        // the CU top edge upwards to the search range / picture top.
        const int boundY = (0 - roiHeight - puPelOffsetY);
3591
        int lowY = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3592
            ? -cuPelY : X265_MAX(srchRngVerTop, 0 - cuPelY);
3593
        for (int y = boundY; y >= lowY; y--)
3594
        {
3595
            if (!isValidIntraBCSearchArea(&cu, 0, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3596
            {
3597
                continue;
3598
            }
3599
3600
            sad = m_me.mvcost(MV(0, y));
3601
3602
            refSrch = refY + y * refStride;
3603
3604
            sad += m_me.bufSAD(refSrch, refStride);
3605
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3606
            {
3607
                continue;
3608
            }
3609
3610
            intraBCSearchMVCandUpdate(sad, 0, y, sadBestCand, MVCand);
3611
            tempSadBest = sadBestCand[0];
3612
            // near-perfect match: accept immediately without chroma refinement
            if (sadBestCand[0] <= 3)
3613
            {
3614
                bestX = MVCand[0].x;
3615
                bestY = MVCand[0].y;
3616
                sadBest = sadBestCand[0];
3617
                mv.set(bestX, bestY);
3618
                cost = sadBest;
3619
3620
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3621
                return;
3622
            }
3623
        }
3624
3625
        // Stage 3: horizontal 1D search (y = 0), from the first column that
        // clears the CU left edge leftwards to the search range / picture edge.
        const int boundX = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled)
3626
            ? -cuPelX : X265_MAX(srchRngHorLeft, -cuPelX);
3627
        for (int x = 0 - roiWidth - puPelOffsetX; x >= boundX; --x)
3628
        {
3629
            if (!isValidIntraBCSearchArea(&cu, x, 0, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3630
            {
3631
                continue;
3632
            }
3633
3634
            sad = m_me.mvcost(MV(x, 0));
3635
3636
            refSrch = refY + x;
3637
            sad += m_me.bufSAD(refSrch, refStride);
3638
3639
            if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3640
            {
3641
                continue;
3642
            }
3643
3644
            intraBCSearchMVCandUpdate(sad, x, 0, sadBestCand, MVCand);
3645
            tempSadBest = sadBestCand[0];
3646
            // near-perfect match: accept immediately without chroma refinement
            if (sadBestCand[0] <= 3)
3647
            {
3648
                bestX = MVCand[0].x;
3649
                bestY = MVCand[0].y;
3650
                sadBest = sadBestCand[0];
3651
                mv.set(bestX, bestY);
3652
                cost = sadBest;
3653
3654
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3655
                return;
3656
            }
3657
        }
3658
3659
        bestX = MVCand[0].x;
3660
        bestY = MVCand[0].y;
3661
        sadBest = sadBestCand[0];
3662
3663
        // Stop after the 1D searches when nothing was found (best is still the
        // zero vector) or when the distortion part of the best cost is <= 32.
        if ((!bestX && !bestY) || (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 32))
3664
        {
3665
            //chroma refine
3666
            bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3667
            bestX = MVCand[bestCandIdx].x;
3668
            bestY = MVCand[bestCandIdx].y;
3669
            sadBest = sadBestCand[bestCandIdx];
3670
            mv.set(bestX, bestY);
3671
            cost = sadBest;
3672
3673
            updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3674
            return;
3675
        }
3676
3677
        // Stage 4: 2D search, only for CU depth > 2 and when the caller did not
        // request 1D-only search.  The range is covered in three passes.
        if (cuGeom.depth > 2 && !bUse1DSearchFor8x8)
3678
        {
3679
            // Pass 1: every second row starting at the top of the range, every column.
            for (int y = X265_MAX(srchRngVerTop, -cuPelY); y <= srchRngVerBottom; y += 2)
3680
            {
3681
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3682
                {
3683
                    continue;
3684
                }
3685
3686
                int tempY = y + relCUPelY + roiHeight - 1;
3687
3688
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x++)
3689
                {
3690
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3691
                    {
3692
                        continue;
3693
                    }
3694
3695
                    int tempX = x + relCUPelX + roiWidth - 1;
3696
3697
                    if ((tempX >= 0) && (tempY >= 0))
3698
                    {
3699
                        int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3700
                        uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3701
                        if (iTempZscanIdx >= cu.m_absIdxInCTU)
3702
                        {
3703
                            continue;
3704
                        }
3705
                    }
3706
3707
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3708
                    {
3709
                        continue;
3710
                    }
3711
3712
                    sad = m_me.mvcost(MV(x, y));
3713
3714
                    refSrch = refY + y * refStride + x;
3715
                    sad += m_me.bufSAD(refSrch, refStride);
3716
3717
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3718
                }
3719
            }
3720
3721
            bestX = MVCand[0].x;
3722
            bestY = MVCand[0].y;
3723
            sadBest = sadBestCand[0];
3724
            // good enough after pass 1 (distortion part <= 16): refine and stop
            if (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 16)
3725
            {
3726
                //chroma refine
3727
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3728
                bestX = MVCand[bestCandIdx].x;
3729
                bestY = MVCand[bestCandIdx].y;
3730
                sadBest = sadBestCand[bestCandIdx];
3731
                mv.set(bestX, bestY);
3732
                cost = sadBest;
3733
3734
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3735
                return;
3736
            }
3737
3738
            // Pass 2: the rows skipped by pass 1, every second column starting
            // at the left end of the range.
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3739
            {
3740
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3741
                {
3742
                    continue;
3743
                }
3744
3745
                int tempY = y + relCUPelY + roiHeight - 1;
3746
3747
                for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x += 2)
3748
                {
3749
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3750
                    {
3751
                        continue;
3752
                    }
3753
3754
                    int tempX = x + relCUPelX + roiWidth - 1;
3755
3756
                    if ((tempX >= 0) && (tempY >= 0))
3757
                    {
3758
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3759
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3760
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3761
                        {
3762
                            continue;
3763
                        }
3764
                    }
3765
3766
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3767
                    {
3768
                        continue;
3769
                    }
3770
3771
                    sad = m_me.mvcost(MV(x, y));
3772
3773
                    refSrch = refY + y * refStride + x;
3774
                    sad += m_me.bufSAD(refSrch, refStride);
3775
3776
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3777
                    {
3778
                        continue;
3779
                    }
3780
3781
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3782
                    if (sadBestCand[0] <= 5)
3783
                    {
3784
                        //chroma refine & return
3785
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3786
                        bestX = MVCand[bestCandIdx].x;
3787
                        bestY = MVCand[bestCandIdx].y;
3788
                        sadBest = sadBestCand[bestCandIdx];
3789
                        mv.set(bestX, bestY);
3790
                        cost = sadBest;
3791
3792
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3793
                        return;
3794
                    }
3795
                }
3796
            }
3797
3798
            bestX = MVCand[0].x;
3799
            bestY = MVCand[0].y;
3800
            sadBest = sadBestCand[0];
3801
3802
            // Stop when the best cost is not below the value recorded during
            // the 1D searches, or when its distortion part is <= 32.
            if ((sadBest >= tempSadBest) || ((sadBest - m_me.mvcost(MV(bestX, bestY))) <= 32))
3803
            {
3804
                //chroma refine
3805
                bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3806
                bestX = MVCand[bestCandIdx].x;
3807
                bestY = MVCand[bestCandIdx].y;
3808
                sadBest = sadBestCand[bestCandIdx];
3809
                mv.set(bestX, bestY);
3810
                cost = sadBest;
3811
3812
                updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3813
                return;
3814
            }
3815
3816
            tempSadBest = sadBestCand[0];
3817
3818
3819
            // Pass 3: the same rows as pass 2, the columns pass 2 skipped.
            for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2)
3820
            {
3821
                if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3822
                {
3823
                    continue;
3824
                }
3825
3826
                int tempY = y + relCUPelY + roiHeight - 1;
3827
3828
                for (int x = (X265_MAX(srchRngHorLeft, -cuPelX) + 1); x <= srchRngHorRight; x += 2)
3829
                {
3830
3831
                    if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3832
                    {
3833
                        continue;
3834
                    }
3835
3836
                    int tempX = x + relCUPelX + roiWidth - 1;
3837
3838
                    if ((tempX >= 0) && (tempY >= 0))
3839
                    {
3840
                        int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3841
                        uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx];
3842
                        if (tempZscanIdx >= cu.m_absIdxInCTU)
3843
                        {
3844
                            continue;
3845
                        }
3846
                    }
3847
3848
                    if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3849
                    {
3850
                        continue;
3851
                    }
3852
3853
                    sad = m_me.mvcost(MV(x, y));
3854
3855
                    refSrch = refY + y * refStride + x;
3856
                    sad += m_me.bufSAD(refSrch, refStride);
3857
                    if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1])
3858
                    {
3859
                        continue;
3860
                    }
3861
3862
                    intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3863
                    if (sadBestCand[0] <= 5)
3864
                    {
3865
                        //chroma refine & return
3866
                        bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3867
                        bestX = MVCand[bestCandIdx].x;
3868
                        bestY = MVCand[bestCandIdx].y;
3869
                        sadBest = sadBestCand[bestCandIdx];
3870
                        mv.set(bestX, bestY);
3871
                        cost = sadBest;
3872
3873
                        updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3874
                        return;
3875
                    }
3876
                }
3877
            }
3878
        }
3879
    }
3880
    else //full search
3881
    {
3882
        // Walk the rows bottom-up; refY always points at the row for the current y.
        refY += (srchRngVerBottom * refStride);
3883
        int picWidth = m_slice->m_sps->picWidthInLumaSamples;
3884
        int picHeight = m_slice->m_sps->picHeightInLumaSamples;
3885
3886
        for (int y = srchRngVerBottom; y >= srchRngVerTop; y--)
3887
        {
3888
            if (((int)(cuPelY + y) < 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight))
3889
            {
3890
                refY -= refStride;
3891
                continue;
3892
            }
3893
3894
            for (int x = srchRngHorLeft; x <= srchRngHorRight; x++)
3895
            {
3896
3897
                if (((int)(cuPelX + x) < 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth))
3898
                {
3899
                    continue;
3900
                }
3901
3902
                int tempX = x + relCUPelX + roiWidth - 1;
3903
                int tempY = y + relCUPelY + roiHeight - 1;
3904
                if ((tempX >= 0) && (tempY >= 0))
3905
                {
3906
                    int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4);
3907
                    uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx];
3908
                    if (iTempZscanIdx >= cu.m_absIdxInCTU)
3909
                    {
3910
                        continue;
3911
                    }
3912
                }
3913
3914
                if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset))
3915
                {
3916
                    continue;
3917
                }
3918
3919
                refSrch = refY + x;
3920
3921
                sad = m_me.bufSAD(refSrch, refStride);
3922
                sad += m_me.mvcost(MV(x, y));
3923
                if (sad < sadBest)
3924
                {
3925
                    sadBest = sad;
3926
                    bestX = x;
3927
                    bestY = y;
3928
                }
3929
                intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand);
3930
            }
3931
3932
            refY -= refStride;
3933
        }
3934
    }
3935
3936
    // Common exit for the full search and for a fast search that did not
    // terminate early: pick the final vector by chroma refinement.
    bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx);
3937
    bestX = MVCand[bestCandIdx].x;
3938
    bestY = MVCand[bestCandIdx].y;
3939
    sadBest = sadBestCand[bestCandIdx];
3940
    mv.set(bestX, bestY);
3941
    cost = sadBest;
3942
3943
    updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc);
3944
3945
}
3946
3947
/* Compute the intra block copy search window for one PU.
 *
 * The window is expressed as offsets relative to the PU position and written
 * to searchRangeLT (left/top) and searchRangeRB (right/bottom); both are passed
 * through cu.clipMv() before returning.
 *
 *   intraBCMode   - mode whose CU is being searched
 *   pred          - predictor; copied and clipped locally, the caller's value is not written
 *   puIdx         - index of the PU inside the CU
 *   roiWidth      - width of the block being matched
 *   roiHeight     - height of the block being matched
 *   searchRangeLT - [out] left/top corner of the search window
 *   searchRangeRB - [out] right/bottom corner of the search window */
void Search::setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB)
{
    CUData& cu = intraBCMode.cu;

    MV clippedPred = pred;
    cu.clipMv(clippedPred);

    // Only the partition address is consumed below; the PU size comes from roiWidth/roiHeight
    uint32_t partAddr;
    int partWidth, partHeight;
    cu.getPartIndexAndSize(puIdx, partAddr, partWidth, partHeight);

    const uint32_t ctuWidth  = m_param->maxCUSize;
    const uint32_t ctuHeight = m_param->maxCUSize;
    const uint32_t puPelX = cu.m_cuPelX + g_zscanToPelX[partAddr];
    const uint32_t puPelY = cu.m_cuPelY + g_zscanToPelY[partAddr];

    const uint32_t picWidth  = m_slice->m_sps->picWidthInLumaSamples;
    const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples;

    const bool fullFrameSearch = false; // disabled by default
    const int  cuSize = 1 << cu.m_log2CUSize[0];

    int rangeLeft, rangeRight, rangeTop, rangeBottom;

    if (cuSize == 16 && cu.m_partSize[0] == SIZE_2Nx2N && fullFrameSearch)
    {
        // full frame search: everything to the left of / above the PU, down to the bottom of the current CTU row
        rangeLeft   = -1 * puPelX;
        rangeTop    = -1 * puPelY;
        rangeRight  = picWidth - puPelX - roiWidth;
        rangeBottom = ctuHeight - puPelY % ctuHeight - roiHeight;
    }
    else
    {
        // (uint32_t)-1 means "no limit" on the number of CTUs searched to the left
        uint32_t searchWidthInCTUs = 1;
        if (cuSize != 8 && fullFrameSearch)
            searchWidthInCTUs = -1;

        const uint32_t maxWidth = searchWidthInCTUs * ctuWidth;

        // Walk the chain of left neighbour CTUs, accumulating one CTU width per
        // neighbour that exists and has a slice attached, up to maxWidth
        uint32_t reachableWidth = 0;
        const CUData* leftCtu = cu.m_cuLeft;
        while (reachableWidth < maxWidth && leftCtu != NULL && leftCtu->m_slice != NULL)
        {
            leftCtu = leftCtu->m_cuLeft;
            reachableWidth += ctuWidth;
        }

        int maxXsr = (puPelX % ctuWidth) + X265_MIN(maxWidth, reachableWidth);
        int maxYsr = puPelY % ctuHeight;

        // Clear bit 2 of the reach when chroma is subsampled in that direction
        const bool chromaSubsampledHor = cu.m_chromaFormat == X265_CSP_I420 || cu.m_chromaFormat == X265_CSP_I422;
        const bool chromaSubsampledVer = cu.m_chromaFormat == X265_CSP_I420;
        if (chromaSubsampledHor)
            maxXsr &= ~0x4;
        if (chromaSubsampledVer)
            maxYsr &= ~0x4;

        rangeLeft   = -maxXsr;
        rangeTop    = -maxYsr;
        rangeRight  = ctuWidth - puPelX % ctuWidth - roiWidth;
        rangeBottom = ctuHeight - puPelY % ctuHeight - roiHeight;
    }

    // Keep the matched block inside the picture when the CTU crosses the right/bottom edge
    if (puPelX + rangeRight + roiWidth > picWidth)
        rangeRight = picWidth % ctuWidth - puPelX % ctuWidth - roiWidth;

    if (puPelY + rangeBottom + roiHeight > picHeight)
        rangeBottom = picHeight % ctuHeight - puPelY % ctuHeight - roiHeight;

    searchRangeLT.x = rangeLeft;
    searchRangeLT.y = rangeTop;
    cu.clipMv(searchRangeLT);

    searchRangeRB.x = rangeRight;
    searchRangeRB.y = rangeBottom;
    cu.clipMv(searchRangeRB);
}
4014
4015
/* Run the intra block copy search for one PU.
 *
 * Looks up the PU geometry, fetches the luma plane of the last list-0 reference
 * picture (m_reconPic[1]), derives the search window from *pred and hands
 * everything to intraPatternSearch().
 *
 *   pred - predictor; only pred[0] is read here
 *   mv   - [in/out] forwarded to intraPatternSearch()
 *   cost - [out] forwarded to intraPatternSearch() */
void Search::intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
{
    CUData& cu = intraBCMode.cu;

    // Two independent copies: one is handed to the range setup by reference,
    // the other stays untouched for the motion estimator
    MV rangePred = *pred;
    const MV mvpForCost = *pred;

    uint32_t puPartAddr;
    int blockWidth;
    int blockHeight;
    cu.getPartIndexAndSize(puIdx, puPartAddr, blockWidth, blockHeight);

    // Reference plane: last entry of list 0
    const int lastRefIdx = m_slice->m_numRefIdx[0] - 1;
    pixel* lumaRef = m_slice->m_refFrameList[0][lastRefIdx]->m_reconPic[1]->getLumaAddr(cu.m_cuAddr, cu.m_absIdxInCTU + puPartAddr);
    int lumaStride = m_slice->m_refFrameList[0][lastRefIdx]->m_reconPic[1]->m_stride;

    MV rangeLT;
    MV rangeRB;
    setIntraSearchRange(intraBCMode, rangePred, puIdx, blockWidth, blockHeight, rangeLT, rangeRB);

    m_me.setMVP(mvpForCost);

    intraPatternSearch(intraBCMode, cuGeom, puIdx, puPartAddr, lumaRef, lumaStride, &rangeLT, &rangeRB,
                       mv, cost, blockWidth, blockHeight, testOnlyPred, bUse1DSearchFor8x8, ibc);
}
4039
4040
/* Intra block copy prediction search over every PU of the CU.
 *
 * For each PU this
 *   1. runs intraBlockCopyEstimate() to obtain a block vector and its cost,
 *   2. measures luma + chroma SAD of the motion-compensated prediction for that
 *      vector and adds the cost of the cheaper of the two AMVP predictors,
 *   3. for non-2Nx2N partitions, evaluates the list-0 merge candidates that
 *      point at the current picture and pass isBlockVectorValid(),
 *   4. stores the cheaper of AMVP / merge in the CU (mv, refIdx, interDir,
 *      mergeFlag, mvd, mvpIdx) and writes the final prediction to
 *      intraBCMode.predYuv.
 *
 * intraBCMode.bestME[puIdx] receives the estimated vector (shifted left by 2)
 * and its cost.
 *
 * Returns false when
 *   - m_param->bEnableSCC == 1 and the CU is larger than SCM_S0067_MAX_CAND_SIZE,
 *   - the search of any PU yields the zero vector (m_lastCandCost is set to
 *     MAX_UINT first when testOnlyPred is set),
 *   - testOnlyPred is set, the CU has one PU and the total cost exceeds
 *     2 * cuSize * cuSize (m_lastCandCost is set to MAX_UINT),
 *   - testOnlyPred is clear, the total cost is below that threshold and
 *     3/4 of it is not below m_lastCandCost.
 * Returns true otherwise; with testOnlyPred set, m_lastCandCost is updated to
 * the total cost. */
bool Search::predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc)
{
    MV zeroMv(0, 0);
    CUData& cu = intraBCMode.cu;
    Yuv* predYuv = &intraBCMode.predYuv;
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
    int  numPart = cu.getNumPartInter(0);
    int log2ParallelMergeLevelMinus2 = 0;

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];

    if (m_param->bEnableSCC == 1 && (1 << cu.m_log2CUSize[0]) > SCM_S0067_MAX_CAND_SIZE) // fast search
        return false;

    // the current picture is at the last position of list0
    const int curPicRefIdx = m_slice->m_numRefIdx[0] - 1;

    uint32_t totalCost = 0;
    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        int width, height;
        uint32_t partAddr = 0;
        MotionData* bestME = intraBCMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);
        MV  mv, mvPred[2];
        cu.getPartIndexAndSize(puIdx, pu.puAbsPartIdx, width, height);
        partAddr = pu.puAbsPartIdx;
        m_me.setSourcePU(*intraBCMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);

        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, intraBCMode.interNeighbours);
        cu.getPMV(intraBCMode.interNeighbours, 0, curPicRefIdx, intraBCMode.amvpCand[0][curPicRefIdx], mvc, puIdx, pu.puAbsPartIdx);

        // AMVP candidates are scaled down by 2 bits before being used as block vector predictors
        mvPred[0].set(intraBCMode.amvpCand[0][curPicRefIdx][0].x >> 2, intraBCMode.amvpCand[0][curPicRefIdx][0].y >> 2);
        mvPred[1].set(intraBCMode.amvpCand[0][curPicRefIdx][1].x >> 2, intraBCMode.amvpCand[0][curPicRefIdx][1].y >> 2);

        // Initialised so it is never read indeterminate if the search leaves it untouched
        uint32_t cost = 0;
        mv.set(0, 0);
        intraBlockCopyEstimate(intraBCMode, cuGeom, puIdx, mvPred, mv, cost, testOnlyPred, bUse1DSearchFor8x8, ibc);

        bestME->mv.set(mv.x << 2, mv.y << 2);
        bestME->cost = cost;
        totalCost += cost;
        if (mv.x == 0 && mv.y == 0)
        {
            // A zero vector means no usable block was found for this PU
            if (testOnlyPred)
            {
                m_lastCandCost = MAX_UINT;
            }
            return false;
        }

        int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
        int distAMVPBest, distMergeTemp;
        int costAMVPBest, costMergeBest, costMergeTemp;
        bitsAMVPBest = MAX_INT;
        costAMVPBest = MAX_INT;
        costMergeBest = MAX_INT;
        int mvpIdxBest = 0;
        int mvpIdxTemp;
        int mrgIdxBest = -1;
        int mrgIdxTemp = -1;
        int xCUStart = cu.m_cuPelX;
        int yCUStart = cu.m_cuPelY;

        // Offset of this PU inside the CU; stays (0, 0) for any other partition size
        int xStartInCU = 0, yStartInCU = 0;
        if (ePartSize == SIZE_2NxN)
            yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;
        else if (ePartSize == SIZE_Nx2N)
            xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx;

        const pixel* currStart;
        pixel* ref;
        int currStride, refStride;
        distAMVPBest = 0;

        // Predict with the searched vector and measure its distortion
        MV cMvQuaterPixl = mv;
        cMvQuaterPixl <<= 2;
        cu.setPUMv(0, cMvQuaterPixl, pu.puAbsPartIdx, puIdx);
        cu.setPURefIdx(0, (int8_t)curPicRefIdx, pu.puAbsPartIdx, puIdx);
        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
        cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
        for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
        {
            if (ch == 0)
            {
                // luma: compared against the source PU registered with m_me.setSourcePU()
                ref = tmpPredYuv.getLumaAddr(partAddr);
                refStride = tmpPredYuv.m_size;
                distAMVPBest += m_me.bufSAD(ref, refStride);
            }
            else
            {
                int tempHeight = height >> m_vChromaShift;
                int tempWidth = width >> m_hChromaShift;

                currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
                currStride = intraBCMode.fencYuv->m_csize;
                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
                refStride = tmpPredYuv.m_csize;
                distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
            }
        }

        mvPred[0].set(intraBCMode.amvpCand[0][curPicRefIdx][0].x >> 2, intraBCMode.amvpCand[0][curPicRefIdx][0].y >> 2);
        mvPred[1].set(intraBCMode.amvpCand[0][curPicRefIdx][1].x >> 2, intraBCMode.amvpCand[0][curPicRefIdx][1].y >> 2);

        // Pick the AMVP predictor that codes the vector with the fewest bits
        for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
        {
            m_me.setMVP(mvPred[mvpIdxTemp]);
            bitsAMVPTemp = m_me.bitcost(mv, mvPred[mvpIdxTemp]);
            if (bitsAMVPTemp < bitsAMVPBest)
            {
                bitsAMVPBest = bitsAMVPTemp;
                mvpIdxBest = mvpIdxTemp;
            }
        }

        bitsAMVPBest++; // for MVP Index bits
        costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);

        MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
        uint8_t uhInterDirNeighbours[MRG_MAX_NUM_CANDS];
        int numValidMergeCand = 0;

        for (int i = 0; i < MRG_MAX_NUM_CANDS; i++)
        {
            cMvFieldNeighbours[i][0].mv.set(0, 0);
            cMvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
        }

        // Merge candidates are only evaluated for non-2Nx2N partitions
        if (ePartSize != SIZE_2Nx2N)
        {
            if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
            {
                cu.setPartSizeSubParts(SIZE_2Nx2N);
                if (puIdx == 0)
                {
                    numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, uhInterDirNeighbours);
                }
                cu.setPartSizeSubParts(ePartSize);
            }
            else
            {
                numValidMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, cMvFieldNeighbours, uhInterDirNeighbours);
            }

            cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand);
            restrictBipredMergeCand(&cu, puIdx, cMvFieldNeighbours, uhInterDirNeighbours, numValidMergeCand);

            for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCand; mrgIdxTemp++)
            {
                // only uni-directional list-0 candidates ...
                if (uhInterDirNeighbours[mrgIdxTemp] != 1)
                {
                    continue;
                }
                // ... that reference the current picture ...
                if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
                {
                    continue;
                }

                // ... with a block vector accepted by isBlockVectorValid()
                if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, width, height, &cu,
                    xStartInCU, yStartInCU, (cMvFieldNeighbours[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighbours[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
                {
                    continue;
                }
                // NOTE(review): mrgIdxTemp looks like it can never reach maxNumMergeCand inside this loop,
                // so the shorter code for the last index is presumably never applied - confirm whether
                // the comparison should be against maxNumMergeCand - 1. Left unchanged on purpose.
                bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;

                distMergeTemp = 0;

                cu.setPUMv(0, cMvFieldNeighbours[mrgIdxTemp][0].mv, pu.puAbsPartIdx, puIdx);
                cu.setPURefIdx(0, (int8_t)curPicRefIdx, pu.puAbsPartIdx, puIdx);
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx);
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
                cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);

                for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
                {
                    if (ch == 0)
                    {
                        ref = tmpPredYuv.getLumaAddr(partAddr);
                        refStride = tmpPredYuv.m_size;
                        distMergeTemp += m_me.bufSAD(ref, refStride);
                    }
                    else
                    {
                        int tempHeight = height >> m_vChromaShift;
                        int tempWidth = width >> m_hChromaShift;

                        currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr);
                        currStride = intraBCMode.fencYuv->m_csize;
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
                        refStride = tmpPredYuv.m_csize;
                        distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
                    }
                }
                costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);

                if (costMergeTemp < costMergeBest)
                {
                    costMergeBest = costMergeTemp;
                    mrgIdxBest = mrgIdxTemp;
                }
            }
        }

        // Without an accepted merge candidate (mrgIdxBest == -1) AMVP is the only option;
        // the explicit check keeps the merge branch from indexing the candidate array with -1.
        if (mrgIdxBest < 0 || costAMVPBest < costMergeBest)
        {
            MV tempmv((mv.x << 2), (mv.y << 2));
            MVField mvField[2];
            mvField[0].mv = tempmv;
            mvField[0].refIdx = curPicRefIdx;   // the current picture is at the last position of list0
            mvField[1].mv = zeroMv;
            mvField[1].refIdx = REF_NOT_VALID;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction

            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            // The difference is stored relative to the down-shifted predictor
            MV mvd;
            mvd.set(mv.x - (intraBCMode.amvpCand[0][curPicRefIdx][mvpIdxBest].x >> 2), mv.y - (intraBCMode.amvpCand[0][curPicRefIdx][mvpIdxBest].y >> 2));

            cu.m_mvd[0][pu.puAbsPartIdx] = mvd;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mvpIdxBest;
        }
        else
        {
            MV mergeMv(cMvFieldNeighbours[mrgIdxBest][0].mv.x, cMvFieldNeighbours[mrgIdxBest][0].mv.y);
            MVField mvField[2];
            mvField[0].mv = mergeMv;
            mvField[0].refIdx = cu.m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
            mvField[1].mv = zeroMv;
            mvField[1].refIdx = REF_NOT_VALID;

            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mrgIdxBest; /* merge candidate ID is stored in L0 MVP idx */
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);  // list 0 prediction

            cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            cu.m_mvd[0][pu.puAbsPartIdx] = zeroMv;
            cu.m_mvd[1][pu.puAbsPartIdx] = zeroMv;
        }

        // Final prediction for this PU with the motion data selected above
        motionCompensation(cu, pu, *predYuv, 1, 1);
    }

    // NOTE(review): this object is never read below; kept in case its constructor matters - confirm and remove
    PredictionUnit pu(cu, cuGeom, 0);
    uint32_t abortThreshold = (1 << cu.m_log2CUSize[0]) * (1 << cu.m_log2CUSize[0]) * 2;
    if (testOnlyPred)
    {
        if (numPart == 1 && totalCost > abortThreshold)
        {
            m_lastCandCost = MAX_UINT;
            return false;
        }
        m_lastCandCost = totalCost;
    }
    else if (totalCost < abortThreshold && 3 * totalCost >> 2 >= m_lastCandCost)
    {
        // a cheap result that does not improve enough on the cost recorded by the predictor-only pass
        return false;
    }
    return true;
}
4320
4321
bool Search::predMixedIntraBCInterSearch(Mode& intraBCMixedMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMvCandList)
4322
{
4323
    intraBCMixedMode.initCosts();
4324
    intraBCMixedMode.cu.setPartSizeSubParts(ePartSize);
4325
    intraBCMixedMode.cu.setPredModeSubParts(MODE_INTER);
4326
    CUData& cu = intraBCMixedMode.cu;
4327
    int numComb = 2;
4328
    int numPart = 2;
4329
    uint32_t cost[2] = { 0,0 };
4330
    uint32_t maxCost = UINT32_MAX;
4331
4332
    int      numPredDir = m_slice->isInterP() ? 1 : 2;
4333
    MV       cMvZero(0, 0);
4334
4335
    MV  cMvPredCand[2][2];
4336
    int IBCValidFlag = 0;
4337
    int bestIBCMvpIdx[2] = { 0, 0 };
4338
    int bestInterMvpIdx[2] = { 0, 0 };
4339
    int bestInterDir[2] = { 0, 0 };
4340
    int bestRefIdx[2] = { 0, 0 };
4341
    bool isMergeMode[2] = { false, false };
4342
    bool isIBCMergeMode[2] = { false, false };
4343
    MVField cMRGMvField[2][2];
4344
    MVField cMRGMvFieldIBC[2][2];
4345
    int log2ParallelMergeLevelMinus2 = 0;
4346
    // 12 mv candidates including lowresMV
4347
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
4348
4349
    Yuv* predYuv = &intraBCMixedMode.predYuv;
4350
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
4351
4352
    for (int combo = 0; combo < numComb; combo++) // number of combination
4353
    {
4354
        for (int partIdx = 0; partIdx < numPart; ++partIdx)
4355
        {
4356
            int dummyWidth, dummyHeight;
4357
            uint32_t partAddr = 0;
4358
            PredictionUnit pu(cu, cuGeom, partIdx);
4359
            cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth, dummyHeight);
4360
            m_me.setSourcePU(*intraBCMixedMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
4361
4362
            MV mvPred[2];
4363
            MV bvPred[2];
4364
            if ((combo == 0 && partIdx == 0) || (combo == 1 && partIdx == 1)) // intraBC
4365
            {
4366
                MV cMv = iMvCandList[8 + partIdx];
4367
                if (cMv.x == 0 && cMv.y == 0)
4368
                {
4369
                    cost[combo] = maxCost;
4370
                    IBCValidFlag++;
4371
                    break;
4372
                }
4373
4374
                cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4375
                cu.getPMV(intraBCMixedMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, partIdx, pu.puAbsPartIdx);
4376
4377
                bvPred[0] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0];
4378
                bvPred[1] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1];
4379
                bvPred[0] >>= 2;
4380
                bvPred[1] >>= 2;
4381
4382
                /////////////////////////////////////////////////////////////
4383
                // ibc merge
4384
                // choose one MVP and compare with merge mode
4385
4386
                int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp;
4387
                int distAMVPBest, distMergeTemp;
4388
                int costAMVPBest, costMergeBest, costMergeTemp;
4389
                bitsAMVPBest = MAX_INT;
4390
                costAMVPBest = MAX_INT;
4391
                costMergeBest = MAX_INT;
4392
                int mvpIdxBest = 0;
4393
                int mvpIdxTemp;
4394
                int mrgIdxBest = -1;
4395
                int mrgIdxTemp = -1;
4396
                int xCUStart = cu.m_cuPelX;
4397
                int yCUStart = cu.m_cuPelY;
4398
                int xStartInCU = 0, yStartInCU = 0;
4399
                if (ePartSize == SIZE_2Nx2N)
4400
                    xStartInCU = yStartInCU = 0;
4401
                else if (ePartSize == SIZE_2NxN)
4402
                {
4403
                    xStartInCU = 0;
4404
                    yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4405
                }
4406
                else if (ePartSize == SIZE_Nx2N)
4407
                {
4408
                    xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx;
4409
                    yStartInCU = 0;
4410
                }
4411
                const pixel* currStart;
4412
                int currStride;
4413
                int refStride;
4414
                distAMVPBest = 0;
4415
                pixel* ref;
4416
4417
                cu.setPUMv(0, cMv, pu.puAbsPartIdx, partIdx);
4418
                cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, partIdx);
4419
                cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4420
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4421
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4422
                motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4423
4424
                for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4425
                {
4426
                    int tempHeight, tempWidth;
4427
                    if (ch == 0)
4428
                    {
4429
                        tempHeight = dummyHeight;
4430
                        tempWidth = dummyWidth;
4431
                        ref = tmpPredYuv.getLumaAddr(partAddr);
4432
                        refStride = tmpPredYuv.m_size;
4433
                        distAMVPBest += m_me.bufSAD(ref, refStride);
4434
                    }
4435
                    else
4436
                    {
4437
                        tempHeight = dummyHeight >> m_vChromaShift;
4438
                        tempWidth = dummyWidth >> m_hChromaShift;
4439
4440
                        currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4441
                        currStride = intraBCMixedMode.fencYuv->m_csize;
4442
                        ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4443
                        refStride = tmpPredYuv.m_csize;
4444
                        distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4445
                    }
4446
                }
4447
4448
                MV check;
4449
                for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++)
4450
                {
4451
                    m_me.setMVP(bvPred[mvpIdxTemp]);
4452
                    bitsAMVPTemp = m_me.bitcost(cMv >> 2, bvPred[mvpIdxTemp]);
4453
                    if (bitsAMVPTemp < bitsAMVPBest)
4454
                    {
4455
                        bitsAMVPBest = bitsAMVPTemp;
4456
                        mvpIdxBest = mvpIdxTemp;
4457
                    }
4458
                }
4459
4460
                bitsAMVPBest++; // for MVP Index bits
4461
                costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest);
4462
4463
                MVField cMvFieldNeighboursIBC[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
4464
                uint8_t uhInterDirNeighboursIBC[MRG_MAX_NUM_CANDS];
4465
                int numValidMergeCandIBC = 0;
4466
4467
                if (ePartSize != SIZE_2Nx2N)
4468
                {
4469
                    if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8)
4470
                    {
4471
                        cu.setPartSizeSubParts(SIZE_2Nx2N);
4472
                        if (partIdx == 0)
4473
                        {
4474
                            numValidMergeCandIBC = cu.getInterMergeCandidates(0, 0, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4475
                        }
4476
                        cu.setPartSizeSubParts(ePartSize);
4477
                    }
4478
                    else
4479
                    {
4480
                        numValidMergeCandIBC = cu.getInterMergeCandidates(pu.puAbsPartIdx, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC);
4481
                    }
4482
4483
                    cu.roundMergeCandidates(cMvFieldNeighboursIBC, numValidMergeCandIBC);
4484
                    restrictBipredMergeCand(&cu, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC, numValidMergeCandIBC);
4485
4486
                    for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCandIBC; mrgIdxTemp++)
4487
                    {
4488
                        if (uhInterDirNeighboursIBC[mrgIdxTemp] != 1)
4489
                        {
4490
                            continue;
4491
                        }
4492
                        if (m_slice->m_refPOCList[0][cMvFieldNeighboursIBC[mrgIdxTemp][0].refIdx] != m_slice->m_poc)
4493
                        {
4494
                            continue;
4495
                        }
4496
4497
                        if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, dummyWidth, dummyHeight, &cu,
4498
                            xStartInCU, yStartInCU, (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize))
4499
                        {
4500
                            continue;
4501
                        }
4502
                        bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1;
4503
4504
                        distMergeTemp = 0;
4505
                        cu.setPUMv(0, cMvFieldNeighboursIBC[mrgIdxTemp][0].mv, pu.puAbsPartIdx, partIdx);
4506
                        cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4507
                        cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx);
4508
                        cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4509
                        cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);
4510
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4511
4512
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4513
                        {
4514
                            int tempHeight, tempWidth;
4515
                            if (ch == 0)
4516
                            {
4517
                                tempHeight = dummyHeight;
4518
                                tempWidth = dummyWidth;
4519
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4520
                                refStride = tmpPredYuv.m_size;
4521
                                distMergeTemp += m_me.bufSAD(ref, refStride);
4522
                            }
4523
                            else
4524
                            {
4525
                                tempHeight = dummyHeight >> m_vChromaShift;
4526
                                tempWidth = dummyWidth >> m_hChromaShift;
4527
4528
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4529
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4530
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4531
                                refStride = tmpPredYuv.m_csize;
4532
                                distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4533
                            }
4534
                        }
4535
                        costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp);
4536
4537
                        if (costMergeTemp < costMergeBest)
4538
                        {
4539
                            costMergeBest = costMergeTemp;
4540
                            mrgIdxBest = mrgIdxTemp;
4541
                        }
4542
                    }
4543
                }
4544
4545
                if (costMergeBest < costAMVPBest)
4546
                {
4547
                    cost[combo] += costMergeBest;
4548
                    isIBCMergeMode[combo] = true;
4549
                    bestIBCMvpIdx[combo] = mrgIdxBest;
4550
4551
                    MVField mvField[2];
4552
                    MV mv(cMvFieldNeighboursIBC[mrgIdxBest][0].mv.x, cMvFieldNeighboursIBC[mrgIdxBest][0].mv.y);
4553
                    mvField[0].mv = mv;
4554
                    mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1;   // the current picture is at the last position of list0
4555
                    mvField[1].mv = cMvZero;
4556
                    mvField[1].refIdx = REF_NOT_VALID;
4557
                    cMRGMvFieldIBC[combo][0] = mvField[0];
4558
                    cMRGMvFieldIBC[combo][1] = mvField[1];
4559
                }
4560
                else
4561
                {
4562
                    cost[combo] += costAMVPBest;
4563
                    isIBCMergeMode[combo] = false;
4564
                    bestIBCMvpIdx[combo] = mvpIdxBest;
4565
                    cMvPredCand[combo][partIdx].set(bvPred[mvpIdxBest].x << 2, bvPred[mvpIdxBest].y << 2);
4566
                }
4567
4568
                cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx);  // list 0 prediction
4569
                if (isIBCMergeMode[combo])
4570
                {
4571
                    cu.setPUMv(0, cMRGMvFieldIBC[combo][0].mv, pu.puAbsPartIdx, partIdx);
4572
                }
4573
                else
4574
                {
4575
                    cu.setPUMv(0, iMvCandList[8 + partIdx], pu.puAbsPartIdx, partIdx);
4576
                    cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx);
4577
                    cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4578
                }
4579
                // ibc merge
4580
                /////////////////////////////////////////////////////////////
4581
            }
4582
            else // is inter PU
4583
            {
4584
                uint32_t  costInterTemp = 0;
4585
                uint32_t  costInterBest = UINT32_MAX;
4586
                const pixel* currStart;
4587
                int currStride;
4588
                pixel* ref;
4589
                int refStride;
4590
                MergeData merge;
4591
                memset(&merge, 0, sizeof(merge));
4592
                for (int refList = 0; refList < numPredDir; refList++)
4593
                {
4594
                    uint32_t numRef = refList ? ((m_slice->m_numRefIdx[1] > 1) ? 2 : 1) : ((m_slice->m_numRefIdx[0] - 1 > 1) ? 2 : 1);
4595
                    for (uint32_t refIdx = 0; refIdx < numRef; refIdx++)
4596
                    {
4597
                        MV cMv = iMvCandList[4 * refList + 2 * refIdx + partIdx];
4598
4599
                        cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours);
4600
                        cu.getPMV(intraBCMixedMode.interNeighbours, refList, refIdx, intraBCMixedMode.amvpCand[refList][refIdx], mvc, partIdx, pu.puAbsPartIdx);
4601
                        int mvpIdx;
4602
4603
                        uint32_t  tempCost0 = 0;
4604
                        uint32_t  tempCost1 = 0;
4605
                        mvPred[0] = intraBCMixedMode.amvpCand[refList][refIdx][0];
4606
                        mvPred[1] = intraBCMixedMode.amvpCand[refList][refIdx][1];
4607
4608
                        m_me.setMVP(mvPred[0]);
4609
                        tempCost0 = m_me.bitcost(cMv, mvPred[0]);
4610
                        m_me.setMVP(mvPred[1]);
4611
                        tempCost1 = m_me.bitcost(cMv, mvPred[1]);
4612
                        if (tempCost1 < tempCost0)
4613
                        {
4614
                            mvpIdx = 1;
4615
                        }
4616
                        else
4617
                        {
4618
                            mvpIdx = 0;
4619
                        }
4620
                        uint32_t bitsTemp = m_listSelBits[refList] + MVP_IDX_BITS;
4621
                        bitsTemp += getTUBits(refIdx, numRef);
4622
4623
                        m_me.setMVP(mvPred[mvpIdx]);
4624
                        if (cu.m_slice->m_useIntegerMv)
4625
                        {
4626
                            cu.setPUMv(refList, (cMv >> 2) << 2, pu.puAbsPartIdx, partIdx);
4627
                        }
4628
                        else
4629
                        {
4630
                            cu.setPUMv(refList, cMv, pu.puAbsPartIdx, partIdx);
4631
                        }
4632
                        cu.setPURefIdx(refList, refIdx, pu.puAbsPartIdx, partIdx);
4633
                        cu.setPUInterDir(1 + refList, pu.puAbsPartIdx, partIdx);
4634
                        motionCompensation(cu, pu, tmpPredYuv, 1, 1);
4635
4636
                        costInterTemp = 0;
4637
                        for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++)
4638
                        {
4639
                            int tempHeight, tempWidth;
4640
                            if (ch == 0)
4641
                            {
4642
                                tempHeight = dummyHeight;
4643
                                tempWidth = dummyWidth;
4644
                                ref = tmpPredYuv.getLumaAddr(partAddr);
4645
                                refStride = tmpPredYuv.m_size;
4646
                                costInterTemp += m_me.bufSAD(ref, refStride);
4647
                            }
4648
                            else
4649
                            {
4650
                                tempHeight = dummyHeight >> m_vChromaShift;
4651
                                tempWidth = dummyWidth >> m_hChromaShift;
4652
4653
                                currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr);
4654
                                currStride = intraBCMixedMode.fencYuv->m_csize;
4655
                                ref = tmpPredYuv.getChromaAddr(ch, partAddr);
4656
                                refStride = tmpPredYuv.m_csize;
4657
                                costInterTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight);
4658
                            }
4659
4660
                            if (costInterTemp >= costInterBest)
4661
                            {
4662
                                break;
4663
                            }
4664
                        }
4665
                        cu.setPURefIdx(refList, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4666
4667
                        costInterTemp += m_me.bitcost(cMv, mvPred[mvpIdx]);
4668
                        costInterTemp += m_rdCost.getCost(bitsTemp);
4669
4670
                        if (costInterTemp < costInterBest)
4671
                        {
4672
                            costInterBest = costInterTemp;
4673
                            bestInterMvpIdx[combo] = mvpIdx;
4674
                            bestInterDir[combo] = refList;
4675
                            bestRefIdx[combo] = refIdx;
4676
                            cMvPredCand[combo][partIdx] = mvPred[mvpIdx];
4677
                        }
4678
                    }
4679
                } // end RefIdx and RefList search
4680
4681
                uint32_t MRGInterDir = 0;
4682
                uint32_t MRGIndex = 0;
4683
4684
                // find Merge result
4685
                uint32_t MRGCost = UINT32_MAX;
4686
                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
4687
4688
                mergeEstimation(cu, cuGeom, pu, partIdx, merge);
4689
                MRGInterDir = merge.dir;
4690
                cMRGMvField[combo][0] = merge.mvField[0];
4691
                cMRGMvField[combo][1] = merge.mvField[1];
4692
                MRGIndex = merge.index;
4693
                cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4694
                cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4695
4696
                if (MRGCost < costInterBest)
4697
                {
4698
                    costInterBest = MRGCost;
4699
                    isMergeMode[combo] = true;
4700
                    bestInterMvpIdx[combo] = MRGIndex;
4701
                    bestInterDir[combo] = MRGInterDir;
4702
                }
4703
4704
                cost[combo] += costInterBest;
4705
                if (isMergeMode[combo])
4706
                {
4707
                    cu.setPUInterDir(bestInterDir[combo], pu.puAbsPartIdx, partIdx);
4708
                    cu.setPUMv(0, cMRGMvField[combo][0].mv, pu.puAbsPartIdx, partIdx);
4709
                    cu.setPURefIdx(0, cMRGMvField[combo][0].refIdx, pu.puAbsPartIdx, partIdx);
4710
                    cu.setPUMv(1, cMRGMvField[combo][1].mv, pu.puAbsPartIdx, partIdx);
4711
                    cu.setPURefIdx(1, cMRGMvField[combo][1].refIdx, pu.puAbsPartIdx, partIdx);
4712
                }
4713
                else
4714
                {
4715
                    int refListOpt = bestInterDir[combo];
4716
                    int refIdxOpt = bestRefIdx[combo];
4717
                    if (cu.m_slice->m_useIntegerMv)
4718
                    {
4719
                        cu.setPUMv(refListOpt, (iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, pu.puAbsPartIdx, partIdx);
4720
                    }
4721
                    else
4722
                    {
4723
                        cu.setPUMv(refListOpt, iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt], pu.puAbsPartIdx, partIdx);
4724
                    }
4725
                    cu.setPURefIdx(refListOpt, refIdxOpt, pu.puAbsPartIdx, partIdx);
4726
                    cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, pu.puAbsPartIdx, partIdx);
4727
                    cu.setPUInterDir(1 + refListOpt, pu.puAbsPartIdx, partIdx);
4728
                    cu.m_mvpIdx[refListOpt][pu.puAbsPartIdx] = bestInterMvpIdx[combo];
4729
                }
4730
            }
4731
        } // for ipartIdx
4732
    } // for combo
4733
4734
    if (IBCValidFlag > 1)
4735
    {
4736
        return false;
4737
    }
4738
4739
    MV cMvd;
4740
    MV cMVFinal;
4741
    if (cost[0] <= cost[1])
4742
    {
4743
        int iDummyWidth1, iDummyHeight1;
4744
        uint32_t partAddr = 0;
4745
        uint32_t partIdx = 0;
4746
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4747
4748
        if (isIBCMergeMode[0])
4749
        {
4750
            cu.m_mergeFlag[partAddr] = true;
4751
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4752
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4753
            cu.setPUMv(0, cMRGMvFieldIBC[0][0].mv, partAddr, partIdx);
4754
            cu.setPURefIdx(0, cMRGMvFieldIBC[0][0].refIdx, partAddr, partIdx);
4755
            cu.setPUMv(1, cMRGMvFieldIBC[0][1].mv, partAddr, partIdx);
4756
            cu.setPURefIdx(1, cMRGMvFieldIBC[0][1].refIdx, partAddr, partIdx);
4757
4758
            cu.m_mvd[0][partAddr] = cMvZero;
4759
            cu.m_mvd[1][partAddr] = cMvZero;
4760
        }
4761
        else
4762
        {
4763
            cu.m_mergeFlag[partAddr] = false;
4764
4765
            cMvd.set((iMvCandList[8].x - cMvPredCand[0][0].x) >> 2, (iMvCandList[8].y - cMvPredCand[0][0].y) >> 2);
4766
            cu.setPUMv(0, iMvCandList[8], partAddr, partIdx);
4767
            cu.m_mvd[0][partAddr] = cMvd;
4768
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0];
4769
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4770
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4771
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4772
        }
4773
4774
        partIdx = 1;
4775
        cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1);
4776
4777
        if (isMergeMode[0])
4778
        {
4779
            cu.m_mergeFlag[partAddr] = true;
4780
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[0];
4781
            cu.setPUInterDir(bestInterDir[0], partAddr, partIdx);  // list 0 prediction
4782
            cu.setPUMv(0, cMRGMvField[0][0].mv, partAddr, partIdx);
4783
            cu.setPURefIdx(0, cMRGMvField[0][0].refIdx, partAddr, partIdx);
4784
            cu.setPUMv(1, cMRGMvField[0][1].mv, partAddr, partIdx);
4785
            cu.setPURefIdx(1, cMRGMvField[0][1].refIdx, partAddr, partIdx);
4786
4787
            cu.m_mvd[0][partAddr] = cMvZero;
4788
            cu.m_mvd[1][partAddr] = cMvZero;
4789
        }
4790
        else
4791
        {
4792
            int refListOpt = bestInterDir[0];
4793
            int refIdxOpt = bestRefIdx[0];
4794
            if (cu.m_slice->m_useIntegerMv)
4795
            {
4796
                cMvd.set(((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[0][1].x >> 2)), ((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[0][1].y >> 2)));
4797
                cu.setPUMv(refListOpt, (iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4798
            }
4799
            else
4800
            {
4801
                cMvd.set(iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[0][1].x, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[0][1].y);
4802
                cu.setPUMv(refListOpt, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4803
            }
4804
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4805
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4806
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4807
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4808
            cu.m_mergeFlag[partAddr] = false;
4809
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[0];
4810
        }
4811
    }
4812
    else
4813
    {
4814
        int dummyWidth2, dummyHeight2;
4815
        uint32_t partAddr = 0;
4816
        uint32_t partIdx = 0;
4817
4818
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4819
4820
        if (isMergeMode[1])
4821
        {
4822
            cu.m_mergeFlag[partAddr] = true;
4823
            cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[1];
4824
            cu.setPUInterDir(bestInterDir[1], partAddr, partIdx);  // list 0 prediction
4825
            cu.setPUMv(0, cMRGMvField[1][0].mv, partAddr, partIdx);
4826
            cu.setPURefIdx(0, cMRGMvField[1][0].refIdx, partAddr, partIdx);
4827
            cu.setPUMv(1, cMRGMvField[1][1].mv, partAddr, partIdx);
4828
            cu.setPURefIdx(1, cMRGMvField[1][1].refIdx, partAddr, partIdx);
4829
4830
            cu.m_mvd[0][partAddr] = cMvZero;
4831
            cu.m_mvd[1][partAddr] = cMvZero;
4832
        }
4833
        else
4834
        {
4835
            int refListOpt = bestInterDir[1];
4836
            int refIdxOpt = bestRefIdx[1];
4837
            if (cu.m_slice->m_useIntegerMv)
4838
            {
4839
                cMvd.set((iMvCandList[2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[1][0].x >> 2), (iMvCandList[2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[1][0].y >> 2));
4840
                cu.setPUMv(refListOpt, (iMvCandList[2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx);
4841
            }
4842
            else
4843
            {
4844
                cMvd.set(iMvCandList[2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[1][0].x, iMvCandList[2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[1][0].y);
4845
                cu.setPUMv(refListOpt, iMvCandList[2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx);
4846
            }
4847
            cu.m_mvd[refListOpt][partAddr] = cMvd;
4848
            cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx);
4849
            cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx);
4850
            cu.setPUInterDir(1 + refListOpt, partAddr, partIdx);
4851
            cu.m_mergeFlag[partAddr] = false;
4852
            cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[1];
4853
        }
4854
4855
        partIdx = 1;
4856
        cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2);
4857
4858
        if (isIBCMergeMode[1])
4859
        {
4860
            cu.m_mergeFlag[partAddr] = true;
4861
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4862
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4863
            cu.setPUMv(0, cMRGMvFieldIBC[1][0].mv, partAddr, partIdx);
4864
            cu.setPURefIdx(0, cMRGMvFieldIBC[1][0].refIdx, partAddr, partIdx);
4865
            cu.setPUMv(1, cMRGMvFieldIBC[1][1].mv, partAddr, partIdx);
4866
            cu.setPURefIdx(1, cMRGMvFieldIBC[1][1].refIdx, partAddr, partIdx);
4867
4868
            cu.m_mvd[0][partAddr] = cMvZero;
4869
            cu.m_mvd[1][partAddr] = cMvZero;
4870
        }
4871
        else
4872
        {
4873
            cu.m_mergeFlag[partAddr] = false;
4874
4875
            cMvd.set(((iMvCandList[9].x - cMvPredCand[1][1].x) >> 2), (iMvCandList[9].y - cMvPredCand[1][1].y) >> 2);
4876
            cu.setPUMv(0, iMvCandList[9], partAddr, partIdx);
4877
            cu.m_mvd[0][partAddr] = cMvd;
4878
            cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1];
4879
            cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx);
4880
            cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx);
4881
            cu.setPUInterDir(1, partAddr, partIdx);  // list 0 prediction
4882
        }
4883
    }
4884
    for (int partIdx = 0; partIdx < numPart; ++partIdx)
4885
    {
4886
        PredictionUnit pu(cu, cuGeom, partIdx);
4887
        motionCompensation(cu, pu, *predYuv, 1, 1);
4888
    }
4889
4890
    return true;
4891
}
4892
#endif
4893
4894
/* Fill the three entries of blockBit[] with the bit estimates used for the
 * given partition mode.
 *
 *   cuMode   - partition shape of the CU
 *   bPSlice  - true when the slice is a P slice
 *   partIdx  - index of the PU inside the CU; selects the first table dimension
 *   lastMode - selects the second table dimension (only read for non-P slices
 *              with a two-part partition shape)
 *   blockBit - output, three values
 *
 * For an unrecognised cuMode the output is left untouched and X265_CHECK fires. */
void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
{
    /* [partIdx][lastMode][entry] for 2NxN / 2NxnU / 2NxnD */
    static const uint32_t horzSplitBits[2][3][3] =
    {
        { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 7, 5, 7 }, { 6, 6, 6 } }
    };
    /* [partIdx][lastMode][entry] for Nx2N / nLx2N / nRx2N */
    static const uint32_t vertSplitBits[2][3][3] =
    {
        { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
        { { 5, 7, 7 }, { 5, 5, 7 }, { 6, 6, 6 } }
    };

    const uint32_t (*splitBits)[3][3];

    switch (cuMode)
    {
    case SIZE_2Nx2N:
    case SIZE_NxN:
        /* single-shape modes: only the first entry depends on the slice type */
        blockBit[0] = bPSlice ? 1 : 3;
        blockBit[1] = 3;
        blockBit[2] = 5;
        return;

    case SIZE_2NxN:
    case SIZE_2NxnU:
    case SIZE_2NxnD:
        splitBits = horzSplitBits;
        break;

    case SIZE_Nx2N:
    case SIZE_nLx2N:
    case SIZE_nRx2N:
        splitBits = vertSplitBits;
        break;

    default:
        X265_CHECK(0, "getBlkBits: unknown cuMode\n");
        return;
    }

    if (bPSlice)
    {
        /* P slices use a fixed triple, the tables are not consulted */
        blockBit[0] = 3;
        blockBit[1] = 0;
        blockBit[2] = 0;
        return;
    }

    const uint32_t* row = splitBits[partIdx][lastMode];
    blockBit[0] = row[0];
    blockBit[1] = row[1];
    blockBit[2] = row[2];
}
4945
4946
/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
4947
const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const
4948
0
{
4949
0
    int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]);
4950
0
    if (diffBits < 0)
4951
0
    {
4952
0
        mvpIdx = !mvpIdx;
4953
0
        uint32_t origOutBits = outBits;
4954
0
        outBits = origOutBits + diffBits;
4955
0
        outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4956
0
    }
4957
0
    return amvpCand[mvpIdx];
4958
0
}
4959
4960
/* Update to default MVP when using an alternative mvp */
4961
void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
4962
0
{
4963
0
    int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP);
4964
0
    uint32_t origOutBits = outBits;
4965
0
    outBits = origOutBits + diffBits;
4966
0
    outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
4967
0
}
4968
4969
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
4970
0
{
4971
0
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
4972
0
    mvmin = mvp - dist;
4973
0
    mvmax = mvp + dist;
4974
4975
0
    if (m_vertRestriction)
4976
0
    {
4977
0
        int mvRestricted = (56 - 1) << 2; // -1 to consider subpel search
4978
0
        if (mvmax.y >= mvRestricted)
4979
0
        {
4980
0
            mvmax.y = mvRestricted; //only positive side is restricted
4981
0
        }
4982
0
    }
4983
4984
0
    cu.clipMv(mvmin);
4985
0
    cu.clipMv(mvmax);
4986
4987
0
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
4988
0
          cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
4989
0
          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
4990
0
    {
4991
0
        int safeX, maxSafeMv;
4992
0
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
4993
0
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
4994
0
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
4995
0
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
4996
0
    }
4997
4998
    // apply restrict on slices
4999
0
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
5000
0
    {
5001
0
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
5002
0
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
5003
0
    }
5004
5005
    /* Clip search range to signaled maximum MV length.
5006
     * We do not support this VUI field being changed from the default */
5007
0
    const int maxMvLen = (1 << 15) - 1;
5008
0
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
5009
0
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
5010
0
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
5011
0
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);
5012
5013
0
    mvmin >>= 2;
5014
0
    mvmax >>= 2;
5015
5016
    /* conditional clipping for frame parallelism */
5017
0
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
5018
0
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);
5019
5020
    /* conditional clipping for negative mv range */
5021
0
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
5022
0
}
5023
5024
/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5025
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
5026
0
{
5027
0
    CUData& cu = interMode.cu;
5028
0
    Yuv* reconYuv = &interMode.reconYuv;
5029
0
    const Yuv* fencYuv = interMode.fencYuv;
5030
0
    Yuv* predYuv = &interMode.predYuv;
5031
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5032
0
    uint32_t depth  = cu.m_cuDepth[0];
5033
5034
    // No residual coding : SKIP mode
5035
5036
0
    cu.setPredModeSubParts(MODE_SKIP);
5037
0
    cu.clearCbf();
5038
0
    cu.setTUDepthSubParts(0, 0, depth);
5039
5040
0
    reconYuv->copyFromYuv(interMode.predYuv);
5041
5042
    // Luma
5043
0
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
5044
0
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5045
0
    interMode.distortion = interMode.lumaDistortion;
5046
    // Chroma
5047
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5048
0
    {
5049
0
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5050
0
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5051
0
        interMode.distortion += interMode.chromaDistortion;
5052
0
    }
5053
0
    cu.m_distortion[0] = interMode.distortion;
5054
0
    m_entropyCoder.load(m_rqt[depth].cur);
5055
0
    m_entropyCoder.resetBits();
5056
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5057
0
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
5058
0
    m_entropyCoder.codeSkipFlag(cu, 0);
5059
0
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5060
0
    m_entropyCoder.codeMergeIndex(cu, 0);
5061
0
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5062
0
    interMode.coeffBits = 0;
5063
0
    interMode.totalBits = interMode.mvBits + skipFlagBits;
5064
0
    if (m_rdCost.m_psyRd)
5065
0
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5066
0
    else if(m_rdCost.m_ssimRd)
5067
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5068
5069
0
    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5070
0
    updateModeCost(interMode);
5071
0
    m_entropyCoder.store(interMode.contexts);
5072
0
}
5073
5074
/* encode residual and calculate rate-distortion for a CU block.
5075
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
5076
void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
5077
0
{
5078
0
    ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);
5079
5080
0
    CUData& cu = interMode.cu;
5081
0
    Yuv* reconYuv = &interMode.reconYuv;
5082
0
    Yuv* predYuv = &interMode.predYuv;
5083
0
    uint32_t depth = cuGeom.depth;
5084
0
    ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;
5085
0
    const Yuv* fencYuv = interMode.fencYuv;
5086
5087
0
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
5088
5089
0
    uint32_t log2CUSize = cuGeom.log2CUSize;
5090
0
    int sizeIdx = log2CUSize - 2;
5091
5092
0
    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5093
5094
0
    uint32_t tuDepthRange[2];
5095
0
    cu.getInterTUQtDepthRange(tuDepthRange, 0);
5096
5097
0
    m_entropyCoder.load(m_rqt[depth].cur);
5098
5099
0
    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
5100
0
        m_maxTUDepth = -1;
5101
0
    else if (m_limitTU & X265_TU_LIMIT_BFS)
5102
0
        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
5103
5104
0
    Cost costs;
5105
0
    if (m_limitTU & X265_TU_LIMIT_NEIGH)
5106
0
    {
5107
        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
5108
0
        int32_t tempDepth = m_maxTUDepth;
5109
0
        if (m_maxTUDepth != -1)
5110
0
        {
5111
0
            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
5112
0
            uint32_t minSize = tuDepthRange[0];
5113
0
            uint32_t maxSize = tuDepthRange[1];
5114
0
            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
5115
0
            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
5116
0
        }
5117
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5118
0
        m_maxTUDepth = tempDepth;
5119
0
    }
5120
0
    else
5121
0
        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
5122
5123
0
    uint32_t tqBypass = cu.m_tqBypass[0];
5124
0
    if (!tqBypass)
5125
0
    {
5126
0
        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5127
0
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5128
0
        {
5129
0
            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
5130
0
            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
5131
0
        }
5132
5133
        /* Consider the RD cost of not signaling any residual */
5134
0
        m_entropyCoder.load(m_rqt[depth].cur);
5135
0
        m_entropyCoder.resetBits();
5136
0
        m_entropyCoder.codeQtRootCbfZero();
5137
0
        uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
5138
5139
0
        uint32_t cbf0Energy; uint64_t cbf0Cost;
5140
0
        if (m_rdCost.m_psyRd)
5141
0
        {
5142
0
            cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5143
0
            cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5144
0
        }
5145
0
        else if(m_rdCost.m_ssimRd)
5146
0
        {
5147
0
            cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);
5148
0
            cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
5149
0
        }
5150
0
        else
5151
0
            cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
5152
5153
0
        if (cbf0Cost < costs.rdcost)
5154
0
        {
5155
0
            cu.clearCbf();
5156
0
            cu.setTUDepthSubParts(0, 0, depth);
5157
0
        }
5158
0
    }
5159
5160
0
    if (cu.getQtRootCbf(0))
5161
0
        saveResidualQTData(cu, *resiYuv, 0, 0);
5162
5163
    /* calculate signal bits for inter/merge/skip coded CU */
5164
0
    m_entropyCoder.load(m_rqt[depth].cur);
5165
5166
0
    m_entropyCoder.resetBits();
5167
0
    if (m_slice->m_pps->bTransquantBypassEnabled)
5168
0
        m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
5169
5170
0
    uint32_t coeffBits, bits, mvBits;
5171
0
    if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
5172
0
    {
5173
0
        cu.setPredModeSubParts(MODE_SKIP);
5174
5175
        /* Merge/Skip */
5176
0
        coeffBits = mvBits = 0;
5177
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5178
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5179
0
        m_entropyCoder.codeMergeIndex(cu, 0);
5180
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5181
0
        bits = mvBits + skipFlagBits;
5182
0
    }
5183
0
    else
5184
0
    {
5185
0
        m_entropyCoder.codeSkipFlag(cu, 0);
5186
0
        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
5187
0
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
5188
0
        m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
5189
0
        m_entropyCoder.codePredInfo(cu, 0);
5190
0
        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
5191
5192
0
        bool bCodeDQP = m_slice->m_pps->bUseDQP;
5193
0
        m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
5194
0
        bits = m_entropyCoder.getNumberOfWrittenBits();
5195
5196
0
        coeffBits = bits - mvBits - skipFlagBits;
5197
0
    }
5198
5199
0
    m_entropyCoder.store(interMode.contexts);
5200
5201
0
    if (cu.getQtRootCbf(0))
5202
0
        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
5203
0
    else
5204
0
        reconYuv->copyFromYuv(*predYuv);
5205
5206
    // update with clipped distortion and cost (qp estimation loop uses unclipped values)
5207
0
    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5208
0
    interMode.distortion = bestLumaDist;
5209
0
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
5210
0
    {
5211
0
        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
5212
0
        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
5213
0
        interMode.chromaDistortion = bestChromaDist;
5214
0
        interMode.distortion += bestChromaDist;
5215
0
    }
5216
0
    if (m_rdCost.m_psyRd)
5217
0
        interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
5218
0
    else if(m_rdCost.m_ssimRd)
5219
0
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);
5220
5221
0
    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
5222
0
    interMode.totalBits = bits;
5223
0
    interMode.lumaDistortion = bestLumaDist;
5224
0
    interMode.coeffBits = coeffBits;
5225
0
    interMode.mvBits = mvBits;
5226
0
    cu.m_distortion[0] = interMode.distortion;
5227
0
    updateModeCost(interMode);
5228
0
    checkDQP(interMode, cuGeom);
5229
5230
#if ENABLE_SCC_EXT
5231
    if (m_param->bEnableSCC)
5232
        interMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx);
5233
#endif
5234
0
}
5235
5236
/* Transform and quantize the inter residual of one TU, recursing through the
 * transform quad-tree where the TU may not be coded whole.
 *
 *   mode        - mode whose CU data (tuDepth, transform-skip, cbf, coefficients) is updated
 *   cuGeom      - geometry of the CU owning this TU (depth, log2CUSize, numPartitions)
 *   absPartIdx  - partition index of the TU inside the CU
 *   tuDepth     - depth of the TU relative to the CU
 *   depthRange  - [0] = lower log2 TU size bound, [1] = upper log2 TU size bound
 *
 * The residual is read from and reconstructed into m_rqt[cuGeom.depth].tmpResiYuv and the
 * quantized coefficients are written to cu.m_trCoeff[]. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    // The TU is coded whole once it is within the upper size bound, unless this is the root TU
    // of a non-2Nx2N CU that is still above the lower size bound
    const bool bForceRootSplit = cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0];
    const bool bCodeWhole = (log2TrSize <= depthRange[1]) && !bForceRootSplit;

    if (!bCodeWhole)
    {
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");

        // process the four quadrants and merge their cbf bits into this depth
        uint32_t partsPerQuadrant = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;
        for (uint32_t quad = 0; quad < 4; quad++)
        {
            uint32_t subPartIdx = absPartIdx + quad * partsPerQuadrant;

            residualTransformQuantInter(mode, cuGeom, subPartIdx, tuDepth + 1, depthRange);
            lumaCbf |= cu.getCbf(subPartIdx, TEXT_LUMA, tuDepth + 1);
            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
            {
                cbCbf |= cu.getCbf(subPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                crCbf |= cu.getCbf(subPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            }
        }

        cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
            cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
        }
        return;
    }

    /* code the TU as one full block */
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
    uint32_t tuDepthC = tuDepth;

    if (log2TrSizeC < 2)
    {
        // chroma size is clamped to log2 size 2 and handled one TU level up; only partitions
        // whose index has its two low bits clear carry chroma
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        tuDepthC--;
        if (absPartIdx & 3)
            codeChroma = 0;
    }

    uint32_t absPartIdxStep = cuGeom.numPartitions >> (tuDepthC * 2);
    uint32_t cbfBit = 1 << tuDepth;

    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* lumaCoeff = cu.m_trCoeff[0] + coeffOffsetY;
    uint32_t sizeIdx = log2TrSize - 2;

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
    cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    const Yuv* fencYuv = mode.fencYuv;

    /* luma */
    int16_t* lumaResi = resiYuv.getLumaAddr(absPartIdx);
    uint32_t lumaStride = resiYuv.m_size;
    const pixel* lumaFenc = fencYuv->getLumaAddr(absPartIdx);

    uint32_t numSigY = m_quant.transformNxN(cu, lumaFenc, fencYuv->m_size, lumaResi, lumaStride, lumaCoeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
    if (numSigY)
    {
        m_quant.invtransformNxN(cu, lumaResi, lumaStride, lumaCoeff, log2TrSize, TEXT_LUMA, false, false, numSigY);
        cu.setCbfSubParts(cbfBit, TEXT_LUMA, absPartIdx, fullDepth);
    }
    else
    {
        // nothing survived quantization: zero the residual block and clear the cbf
        primitives.cu[sizeIdx].blockfill_s[lumaStride % 64 == 0](lumaResi, lumaStride, 0);
        cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
    }

    if (!codeChroma)
        return;

    /* chroma: Cb then Cr for every section delivered by the TU iterator */
    uint32_t sizeIdxC = log2TrSizeC - 2;
    uint32_t chromaStride = resiYuv.m_csize;
    uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

    const TextType chromaType[2] = { TEXT_CHROMA_U, TEXT_CHROMA_V };
    coeff_t* const chromaCoeffBase[2] = { cu.m_trCoeff[1] + coeffOffsetC, cu.m_trCoeff[2] + coeffOffsetC };

    // 4:2:2 is iterated with a vertical split, every other format as a single section
    bool bSplitIntoSubTUs = (m_csp == X265_CSP_I422);

    TURecurse tuIterator(bSplitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
    do
    {
        uint32_t partIdxC = tuIterator.absPartIdxTURelCU;
        uint32_t sectionOffset = tuIterator.section << (log2TrSizeC * 2);

        cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, partIdxC, tuIterator.absPartIdxStep);
        cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, partIdxC, tuIterator.absPartIdxStep);

        int16_t* const chromaResi[2] = { resiYuv.getCbAddr(partIdxC), resiYuv.getCrAddr(partIdxC) };
        const pixel* const chromaFenc[2] = { fencYuv->getCbAddr(partIdxC), fencYuv->getCrAddr(partIdxC) };

        for (int comp = 0; comp < 2; comp++)
        {
            coeff_t* chromaCoeff = chromaCoeffBase[comp] + sectionOffset;

            uint32_t numSigC = m_quant.transformNxN(cu, chromaFenc[comp], fencYuv->m_csize, chromaResi[comp], chromaStride, chromaCoeff, log2TrSizeC, chromaType[comp], partIdxC, false);
            if (numSigC)
            {
                m_quant.invtransformNxN(cu, chromaResi[comp], chromaStride, chromaCoeff, log2TrSizeC, chromaType[comp], false, false, numSigC);
                cu.setCbfPartRange(cbfBit, chromaType[comp], partIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                primitives.cu[sizeIdxC].blockfill_s[chromaStride % 64 == 0](chromaResi[comp], chromaStride, 0);
                cu.setCbfPartRange(0, chromaType[comp], partIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (bSplitIntoSubTUs)
    {
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
5372
5373
/* Return the RD cost of coding a component with a zero cbf.
 *
 *   dist     - distortion to charge for this choice
 *   energy   - energy term, only used by the psy-rd and ssim-rd cost formulas
 *   tuDepth  - TU depth handed to the cbf bit estimator
 *   compId   - component whose cbf is being estimated
 *
 * The bit cost is the estimate for a cbf of 0; the cost formula follows the
 * active m_rdCost mode (psy-rd first, then ssim-rd, otherwise plain RD). */
uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId)
{
    const uint32_t zeroCbfBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);

    if (m_rdCost.m_psyRd)
        return m_rdCost.calcPsyRdCost(dist, zeroCbfBits, energy);

    if (m_rdCost.m_ssimRd)
        return m_rdCost.calcSsimRdCost(dist, zeroCbfBits, energy);

    return m_rdCost.calcRdCost(dist, zeroCbfBits);
}
5384
5385
/* Evaluate the four sub-TUs of a TU and accumulate the cost of the split.
 *
 *   mode, cuGeom  - mode and CU geometry being analysed
 *   absPartIdx    - partition index of the TU being split
 *   tuDepth       - depth of that TU relative to the CU
 *   resiYuv       - residual buffer forwarded to estimateResidualQT()
 *   splitCost     - in/out accumulator; the sub-TU calls add to it, then the bits of the
 *                   subdivision/cbf flags are added and rdcost is recomputed
 *   depthRange    - TU size limits forwarded to the sub-TU calls
 *   splitMore     - forwarded unchanged to estimateResidualQT()
 *
 * Returns true when any sub-TU has a non-zero luma or chroma cbf at tuDepth + 1. */
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    uint32_t partsPerQuadrant = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    uint32_t lumaCbf = 0, cbCbf = 0, crCbf = 0;

    for (uint32_t quad = 0; quad < 4; quad++)
    {
        uint32_t subPartIdx = absPartIdx + quad * partsPerQuadrant;

        if (quad == 1 && tuDepth == 0 && (m_limitTU & X265_TU_LIMIT_DFS))
        {
            // The first quadrant is done: record the deepest TU depth it chose so that the
            // recursion of the remaining quadrants can be limited to it
            m_maxTUDepth = cu.m_tuDepth[0];
            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
        }

        estimateResidualQT(mode, cuGeom, subPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);

        lumaCbf |= cu.getCbf(subPartIdx, TEXT_LUMA, tuDepth + 1);
        const bool bHasChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
        if (bHasChroma)
        {
            cbCbf |= cu.getCbf(subPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            crCbf |= cu.getCbf(subPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
    }

    // propagate the merged child cbf bits to this depth
    cu.m_cbf[0][absPartIdx] |= lumaCbf << tuDepth;
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        cu.m_cbf[1][absPartIdx] |= cbCbf << tuDepth;
        cu.m_cbf[2][absPartIdx] |= crCbf << tuDepth;
    }

    // Only the subdivision and cbf flags of the split are coded here; per the original author's
    // note the coefficient bits were already collected for each individual block (chroma cbfs are
    // coded differently from luma). Open question kept from that note: the coefficients may have
    // been coded with contexts from a deeper level (e.g. depth 2) while these flags are coded with
    // the contexts stored for this depth (e.g. depth 0).
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
    m_entropyCoder.resetBits();
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
    uint32_t flagBits = m_entropyCoder.getNumberOfWrittenBits();
    splitCost.bits += flagBits;

    if (m_rdCost.m_psyRd)
    {
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    }
    else if (m_rdCost.m_ssimRd)
    {
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    }
    else
    {
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
    }

    return (lumaCbf | cbCbf | crCbf) != 0;
}
5436
5437
void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
5438
0
{
5439
0
    CUData& cu = mode.cu;
5440
0
    uint32_t depth = cuGeom.depth + tuDepth;
5441
0
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
5442
0
    bool bEnableRDOQ = !!m_param->rdoqLevel;
5443
5444
0
    bool bCheckSplit = log2TrSize > depthRange[0];
5445
0
    bool bCheckFull = log2TrSize <= depthRange[1];
5446
0
    bool bSaveTUData = false, bLoadTUData = false;
5447
0
    uint32_t idx = 0;
5448
5449
0
    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
5450
0
    {
5451
0
        if (bCheckSplit && bCheckFull && tuDepth)
5452
0
        {
5453
0
            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
5454
0
            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
5455
0
            idx = (depth - 1) * 4 + qIdx;
5456
0
            if (splitMore)
5457
0
            {
5458
0
                bLoadTUData = true;
5459
0
                bCheckFull = false;
5460
0
            }
5461
0
            else
5462
0
            {
5463
0
                bSaveTUData = true;
5464
0
                bCheckSplit = false;
5465
0
            }
5466
0
        }
5467
0
    }
5468
0
    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
5469
0
    {
5470
0
        if (bCheckSplit && m_maxTUDepth >= 0)
5471
0
        {
5472
0
            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
5473
0
            bCheckSplit = log2TrSize > log2MaxTrSize;
5474
0
        }
5475
0
    }
5476
5477
0
    bool bSplitPresentFlag = bCheckSplit && bCheckFull;
5478
5479
0
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
5480
0
        bCheckFull = false;
5481
5482
0
    X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
5483
5484
0
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
5485
0
    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
5486
0
    uint32_t tuDepthC = tuDepth;
5487
0
    if (log2TrSizeC < 2)
5488
0
    {
5489
0
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
5490
0
        log2TrSizeC = 2;
5491
0
        tuDepthC--;
5492
0
        codeChroma &= !(absPartIdx & 3);
5493
0
    }
5494
5495
    // code full block
5496
0
    Cost fullCost;
5497
0
    fullCost.rdcost = MAX_INT64;
5498
5499
0
    uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5500
0
    uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
5501
0
    uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5502
0
    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5503
0
    uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5504
0
    uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
5505
0
    uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
5506
5507
0
    m_entropyCoder.store(m_rqt[depth].rqtRoot);
5508
5509
0
    uint32_t trSize = 1 << log2TrSize;
5510
0
    const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
5511
0
    uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
5512
0
    const Yuv* fencYuv = mode.fencYuv;
5513
5514
    // code full block
5515
0
    if (bCheckFull)
5516
0
    {
5517
0
        uint32_t trSizeC = 1 << log2TrSizeC;
5518
0
        int partSize = partitionFromLog2Size(log2TrSize);
5519
0
        int partSizeC = partitionFromLog2Size(log2TrSizeC);
5520
0
        const uint32_t qtLayer = log2TrSize - 2;
5521
0
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
5522
0
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
5523
5524
0
        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
5525
0
        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
5526
0
        bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
5527
5528
0
        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
5529
0
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5530
5531
0
        if (bEnableRDOQ)
5532
0
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5533
5534
0
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
5535
0
        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
5536
0
        numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
5537
0
        cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
5538
5539
0
        m_entropyCoder.resetBits();
5540
5541
0
        if (bSplitPresentFlag && log2TrSize > depthRange[0])
5542
0
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
5543
5544
0
        if (cbfFlag[TEXT_LUMA][0])
5545
0
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
5546
0
        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
5547
5548
0
        X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
5549
5550
        //Assuming zero residual 
5551
0
        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5552
0
        uint32_t zeroEnergyY = 0;
5553
0
        if (m_rdCost.m_psyRd)
5554
0
            zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
5555
0
        else if(m_rdCost.m_ssimRd)
5556
0
            zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx);
5557
5558
0
        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
5559
0
        uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
5560
5561
0
        if (cbfFlag[TEXT_LUMA][0])
5562
0
        {
5563
0
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
5564
5565
            // non-zero cost calculation for luma - This is an approximation
5566
            // finally we have to encode correct cbf after comparing with null cost
5567
0
            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
5568
0
            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
5569
0
            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
5570
0
            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5571
0
            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
5572
0
            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
5573
0
            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
5574
5575
0
            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
5576
0
            uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
5577
0
            uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0;
5578
0
            if (m_rdCost.m_psyRd)
5579
0
            {
5580
0
                nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
5581
0
                singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5582
0
            }
5583
0
            else if(m_rdCost.m_ssimRd)
5584
0
            {
5585
0
                nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx);
5586
0
                singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY);
5587
0
            }
5588
0
            else
5589
0
                singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
5590
5591
0
            if (cu.m_tqBypass[0])
5592
0
            {
5593
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5594
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5595
0
            }
5596
0
            else
5597
0
            {
5598
                // zero-cost calculation for luma. This is an approximation
5599
                // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
5600
                // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
5601
0
                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5602
5603
0
                if (nullCostY < singleCostY)
5604
0
                {
5605
0
                    cbfFlag[TEXT_LUMA][0] = 0;
5606
0
                    singleBits[TEXT_LUMA][0] = 0;
5607
0
                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5608
#if CHECKED_BUILD || _DEBUG
5609
                    uint32_t numCoeffY = 1 << (log2TrSize << 1);
5610
                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
5611
#endif
5612
0
                    if (checkTransformSkipY)
5613
0
                        minCost[TEXT_LUMA][0] = nullCostY;
5614
0
                    singleDist[TEXT_LUMA][0] = zeroDistY;
5615
0
                    singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5616
0
                }
5617
0
                else
5618
0
                {
5619
0
                    if (checkTransformSkipY)
5620
0
                        minCost[TEXT_LUMA][0] = singleCostY;
5621
0
                    singleDist[TEXT_LUMA][0] = nonZeroDistY;
5622
0
                    singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5623
0
                }
5624
0
            }
5625
0
        }
5626
0
        else
5627
0
        {
5628
0
            if (checkTransformSkipY)
5629
0
                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
5630
0
            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
5631
0
            singleDist[TEXT_LUMA][0] = zeroDistY;
5632
0
            singleBits[TEXT_LUMA][0] = 0;
5633
0
            singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
5634
0
        }
5635
5636
0
        cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5637
5638
0
        if (codeChroma)
5639
0
        {
5640
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5641
0
            uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
5642
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5643
0
            {
5644
0
                sse_t zeroDistC = 0;
5645
0
                uint32_t zeroEnergyC = 0;
5646
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5647
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5648
5649
0
                do
5650
0
                {
5651
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5652
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5653
5654
0
                    cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5655
5656
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5657
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5658
5659
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5660
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5661
0
                    numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
5662
0
                    cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
5663
5664
0
                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
5665
0
                    if (cbfFlag[chromaId][tuIterator.section])
5666
0
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5667
5668
0
                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
5669
5670
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5671
0
                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
5672
5673
                    // Assuming zero residual 
5674
0
                    if (m_rdCost.m_psyRd)
5675
0
                        zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
5676
0
                    else if(m_rdCost.m_ssimRd)
5677
0
                        zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5678
5679
0
                    if (cbfFlag[chromaId][tuIterator.section])
5680
0
                    {
5681
0
                        m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset,
5682
0
                                                log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
5683
5684
                        // non-zero cost calculation for chroma, same as luma - This is an approximation
5685
                        // finally we have to encode correct cbf after comparing with null cost
5686
0
                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
5687
0
                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
5688
0
                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5689
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5690
0
                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5691
0
                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
5692
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
5693
0
                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
5694
0
                        uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
5695
0
                        uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
5696
0
                        if (m_rdCost.m_psyRd)
5697
0
                        {
5698
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
5699
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5700
0
                        }
5701
0
                        else if(m_rdCost.m_ssimRd)
5702
0
                        {
5703
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5704
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5705
0
                        }
5706
0
                        else
5707
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
5708
5709
0
                        if (cu.m_tqBypass[0])
5710
0
                        {
5711
0
                            singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5712
0
                            singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5713
0
                        }
5714
0
                        else
5715
0
                        {
5716
                            //zero-cost calculation for chroma. This is an approximation
5717
0
                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId);
5718
5719
0
                            if (nullCostC < singleCostC)
5720
0
                            {
5721
0
                                cbfFlag[chromaId][tuIterator.section] = 0;
5722
0
                                singleBits[chromaId][tuIterator.section] = 0;
5723
0
                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5724
#if CHECKED_BUILD || _DEBUG
5725
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5726
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
5727
#endif
5728
0
                                if (checkTransformSkipC)
5729
0
                                    minCost[chromaId][tuIterator.section] = nullCostC;
5730
0
                                singleDist[chromaId][tuIterator.section] = zeroDistC;
5731
0
                                singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5732
0
                            }
5733
0
                            else
5734
0
                            {
5735
0
                                if (checkTransformSkipC)
5736
0
                                    minCost[chromaId][tuIterator.section] = singleCostC;
5737
0
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5738
0
                                singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5739
0
                            }
5740
0
                        }
5741
0
                    }
5742
0
                    else
5743
0
                    {
5744
0
                        if (checkTransformSkipC)
5745
0
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
5746
0
                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
5747
0
                        singleBits[chromaId][tuIterator.section] = 0;
5748
0
                        singleDist[chromaId][tuIterator.section] = zeroDistC;
5749
0
                        singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
5750
0
                    }
5751
5752
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5753
0
                }
5754
0
                while (tuIterator.isNextSection());
5755
0
            }
5756
0
        }
5757
5758
0
        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
5759
0
        {
5760
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5761
0
            {
5762
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5763
0
                do
5764
0
                {
5765
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5766
0
                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5767
0
                }
5768
0
                while(tuIterator.isNextSection());
5769
0
            }
5770
0
        }
5771
0
        if (checkTransformSkipY)
5772
0
        {
5773
0
            sse_t nonZeroDistY = 0;
5774
0
            uint32_t nonZeroEnergyY = 0;
5775
0
            uint64_t singleCostY = MAX_INT64;
5776
5777
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5778
5779
0
            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
5780
5781
0
            if (bEnableRDOQ)
5782
0
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
5783
5784
0
            fenc = fencYuv->getLumaAddr(absPartIdx);
5785
0
            resi = resiYuv.getLumaAddr(absPartIdx);
5786
0
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
5787
5788
0
            if (numSigTSkipY)
5789
0
            {
5790
0
                m_entropyCoder.resetBits();
5791
0
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
5792
0
                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
5793
0
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
5794
5795
0
                m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
5796
0
                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
5797
5798
0
                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
5799
0
                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
5800
0
                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
5801
5802
0
                if (m_rdCost.m_psyRd)
5803
0
                {
5804
0
                    nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
5805
0
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5806
0
                }
5807
0
                else if(m_rdCost.m_ssimRd)
5808
0
                {
5809
0
                    nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx);
5810
0
                    singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY);
5811
0
                }
5812
0
                else
5813
0
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
5814
0
            }
5815
5816
0
            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
5817
0
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
5818
0
            else
5819
0
            {
5820
0
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
5821
0
                singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY;
5822
0
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
5823
0
                bestTransformMode[TEXT_LUMA][0] = 1;
5824
0
                if (m_param->limitTU)
5825
0
                    numSig[TEXT_LUMA][0] = numSigTSkipY;
5826
0
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
5827
0
                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
5828
0
                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
5829
0
            }
5830
5831
0
            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
5832
0
        }
5833
5834
0
        if (codeChroma && checkTransformSkipC)
5835
0
        {
5836
0
            sse_t nonZeroDistC = 0;
5837
0
            uint32_t nonZeroEnergyC = 0;
5838
0
            uint64_t singleCostC = MAX_INT64;
5839
0
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
5840
0
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
5841
5842
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
5843
5844
0
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
5845
0
            {
5846
0
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
5847
0
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
5848
5849
0
                do
5850
0
                {
5851
0
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
5852
0
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
5853
5854
0
                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
5855
5856
0
                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5857
5858
0
                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
5859
0
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
5860
5861
0
                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
5862
0
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
5863
0
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
5864
5865
0
                    m_entropyCoder.resetBits();
5866
0
                    singleBits[chromaId][tuIterator.section] = 0;
5867
5868
0
                    if (numSigTSkipC)
5869
0
                    {
5870
0
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
5871
0
                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
5872
0
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
5873
5874
0
                        m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
5875
0
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
5876
0
                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
5877
0
                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
5878
0
                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
5879
0
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
5880
0
                        if (m_rdCost.m_psyRd)
5881
0
                        {
5882
0
                            nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
5883
0
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5884
0
                        }
5885
0
                        else if(m_rdCost.m_ssimRd)
5886
0
                        {
5887
0
                            nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC);
5888
0
                            singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC);
5889
0
                        }
5890
0
                        else
5891
0
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
5892
0
                    }
5893
5894
0
                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
5895
0
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5896
0
                    else
5897
0
                    {
5898
0
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
5899
0
                        singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC;
5900
0
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
5901
0
                        bestTransformMode[chromaId][tuIterator.section] = 1;
5902
0
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
5903
0
                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
5904
0
                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
5905
0
                    }
5906
5907
0
                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
5908
0
                }
5909
0
                while (tuIterator.isNextSection());
5910
0
            }
5911
0
        }
5912
5913
        // Here we were encoding cbfs and coefficients, after calculating distortion above.
5914
        // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
5915
        // bits required for coefficients and added with number of cbf bits. As I tested the order does not
5916
        // make any difference. But bit confused whether I should load the original context as below.
5917
0
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
5918
0
        m_entropyCoder.resetBits();
5919
5920
        //Encode cbf flags
5921
0
        if (codeChroma)
5922
0
        {
5923
0
            if (!splitIntoSubTUs)
5924
0
            {
5925
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5926
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5927
0
            }
5928
0
            else
5929
0
            {
5930
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
5931
0
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
5932
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth);
5933
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth);
5934
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth);
5935
0
                m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth);
5936
0
            }
5937
0
        }
5938
5939
0
        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
5940
5941
0
        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
5942
5943
0
        uint32_t coeffBits = 0;
5944
0
        coeffBits = singleBits[TEXT_LUMA][0];
5945
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5946
0
        {
5947
0
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
5948
0
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
5949
0
        }
5950
5951
        // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
5952
        // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for
5953
        // four split block's individual cbf value. This is not known before analysis of four split blocks.
5954
        // For that reason, I am collecting individual coefficient bits only.
5955
0
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
5956
5957
0
        fullCost.distortion += singleDist[TEXT_LUMA][0];
5958
0
        fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
5959
0
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
5960
0
        {
5961
0
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
5962
0
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
5963
0
        }
5964
5965
0
        if (m_rdCost.m_psyRd)
5966
0
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5967
0
        else if(m_rdCost.m_ssimRd)
5968
0
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
5969
0
        else
5970
0
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
5971
5972
0
        if (m_param->limitTU && bCheckSplit)
5973
0
        {
5974
            // Stop recursion if the TU's energy level is minimal
5975
0
            uint32_t numCoeff = trSize * trSize;
5976
0
            if (cbfFlag[TEXT_LUMA][0] == 0)
5977
0
                bCheckSplit = false;
5978
0
            else if (numSig[TEXT_LUMA][0] < (numCoeff / 64))
5979
0
            {
5980
0
                uint32_t energy = 0;
5981
0
                for (uint32_t i = 0; i < numCoeff; i++)
5982
0
                    energy += abs(coeffCurY[i]);
5983
0
                if (energy == numSig[TEXT_LUMA][0])
5984
0
                    bCheckSplit = false;
5985
0
            }
5986
0
        }
5987
5988
0
        if (bSaveTUData)
5989
0
        {
5990
0
            for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
5991
0
            {
5992
0
                for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
5993
0
                {
5994
0
                    m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part];
5995
0
                    m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part];
5996
0
                }
5997
0
            }
5998
0
            m_cacheTU.cost[idx] = fullCost;
5999
0
            m_entropyCoder.store(m_cacheTU.rqtStore[idx]);
6000
0
        }
6001
0
    }
6002
0
    if (bLoadTUData)
6003
0
    {
6004
0
        for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++)
6005
0
        {
6006
0
            for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++)
6007
0
            {
6008
0
                bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part];
6009
0
                cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part];
6010
0
            }
6011
0
        }
6012
0
        fullCost = m_cacheTU.cost[idx];
6013
0
        m_entropyCoder.load(m_cacheTU.rqtStore[idx]);
6014
0
        bCheckFull = true;
6015
0
    }
6016
6017
    // code sub-blocks
6018
0
    if (bCheckSplit)
6019
0
    {
6020
0
        if (bCheckFull)
6021
0
        {
6022
0
            m_entropyCoder.store(m_rqt[depth].rqtTest);
6023
0
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
6024
0
        }
6025
6026
0
        Cost splitCost;
6027
0
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6028
0
        {
6029
            // Subdiv flag can be encoded at the start of analysis of split blocks.
6030
0
            m_entropyCoder.resetBits();
6031
0
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6032
0
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6033
0
        }
6034
6035
0
        bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0);
6036
0
        if (yCbCrCbf || !bCheckFull)
6037
0
        {
6038
0
            if (splitCost.rdcost < fullCost.rdcost)
6039
0
            {
6040
0
                if (m_limitTU & X265_TU_LIMIT_BFS)
6041
0
                {
6042
0
                    uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1);
6043
0
                    bool nextSplit = nextlog2TrSize > depthRange[0];
6044
0
                    if (nextSplit)
6045
0
                    {
6046
0
                        m_entropyCoder.load(m_rqt[depth].rqtRoot);
6047
0
                        splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0;
6048
0
                        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
6049
0
                        {
6050
                            // Subdiv flag can be encoded at the start of analysis of split blocks.
6051
0
                            m_entropyCoder.resetBits();
6052
0
                            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
6053
0
                            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
6054
0
                        }
6055
0
                        splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1);
6056
0
                    }
6057
0
                }
6058
0
                outCosts.distortion += splitCost.distortion;
6059
0
                outCosts.rdcost     += splitCost.rdcost;
6060
0
                outCosts.bits       += splitCost.bits;
6061
0
                outCosts.energy     += splitCost.energy;
6062
0
                return;
6063
0
            }
6064
0
            else
6065
0
                outCosts.energy     += splitCost.energy;
6066
0
        }
6067
6068
0
        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
6069
0
        if (codeChroma)
6070
0
        {
6071
0
            if (!splitIntoSubTUs)
6072
0
            {
6073
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
6074
0
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
6075
0
            }
6076
0
            else
6077
0
            {
6078
0
                uint32_t tuNumParts = absPartIdxStep >> 1;
6079
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6080
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6081
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6082
0
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6083
0
            }
6084
0
        }
6085
0
        X265_CHECK(bCheckFull, "check-full must be set\n");
6086
0
        m_entropyCoder.load(m_rqt[depth].rqtTest);
6087
0
    }
6088
6089
0
    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
6090
0
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
6091
6092
0
    if (codeChroma)
6093
0
    {
6094
0
        if (!splitIntoSubTUs)
6095
0
        {
6096
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
6097
0
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
6098
0
        }
6099
0
        else
6100
0
        {
6101
0
            uint32_t tuNumParts = absPartIdxStep >> 1;
6102
6103
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
6104
0
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
6105
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx             , tuNumParts);
6106
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
6107
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx             , tuNumParts);
6108
0
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
6109
0
        }
6110
0
    }
6111
6112
0
    outCosts.distortion += fullCost.distortion;
6113
0
    outCosts.rdcost     += fullCost.rdcost;
6114
0
    outCosts.bits       += fullCost.bits;
6115
0
    outCosts.energy     += fullCost.energy;
6116
0
}
6117
6118
/* Walk the transform quadtree of an inter CU, starting at (absPartIdx, tuDepth),
 * and hand the coded-block flags to m_entropyCoder.
 *
 *   cu          CU whose m_tuDepth / CBF data is read
 *   absPartIdx  partition index of the TU being visited
 *   tuDepth     depth of that TU inside the CU
 *   depthRange  not inspected here; only forwarded to the recursive calls
 *
 * Chroma U/V flags are coded at every visited depth where chroma is enabled and
 * (log2TrSize - m_hChromaShift) is at least 2; the luma flag is coded only for
 * TUs that are not subdivided further. */
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    const bool bIsLeaf = tuDepth >= cu.m_tuDepth[absPartIdx];

    if (m_csp != X265_CSP_I400 &&
        m_frame->m_fencPic->m_picCsp != X265_CSP_I400 &&
        log2TrSize - m_hChromaShift >= 2)
    {
        /* NOTE(review): the mask appears to round absPartIdx down to the start of
         * the TU one level up, whose CBF gates the flags below - confirm */
        const uint32_t parentIdx = absPartIdx & (0xFF << ((log2TrSize + 1 - LOG2_UNIT_SIZE) * 2));

        /* at depth 0 the flag is always coded; deeper, only when the CBF read
         * at parentIdx for depth (tuDepth - 1) is set */
        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, bIsLeaf);

        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, bIsLeaf);
    }

    if (bIsLeaf)
    {
        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
        return;
    }

    /* subdivided: visit the four quadrants, each spanning quadParts partitions */
    const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
    for (uint32_t quad = 0; quad < 4; quad++)
        codeInterSubdivCbfQT(cu, absPartIdx + quad * quadParts, tuDepth + 1, depthRange);
}
6147
6148
/* Copy the residual samples and transform coefficients of the TU quadtree rooted
 * at (absPartIdx, tuDepth) out of the per-layer m_rqt[] buffers: residual goes
 * into resiYuv, coefficients into cu.m_trCoeff[]. Recurses while tuDepth is
 * smaller than the TU depth recorded in cu.m_tuDepth[absPartIdx]. */
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* not a leaf yet: descend into the four quadrants */
        const uint32_t quadParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
        for (uint32_t quad = 0; quad < 4; quad++)
            saveResidualQTData(cu, resiYuv, absPartIdx + quad * quadParts, tuDepth + 1);
        return;
    }

    const uint32_t qtLayer = log2TrSize - 2;

    bool bCopyChroma = m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        /* chroma size is clamped to log2 == 2 and chroma is then handled only
         * for partitions whose index is a multiple of four */
        log2TrSizeC = 2;
        bCopyChroma = bCopyChroma && !(absPartIdx & 3);
    }

    /* luma: residual block, then its coefficients */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    const uint32_t lumaCoeffCount = 1 << (log2TrSize * 2);
    const uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    memcpy(cu.m_trCoeff[0] + coeffOffsetY,
           m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY,
           sizeof(coeff_t) * lumaCoeffCount);

    if (!bCopyChroma)
        return;

    /* chroma: residual for the chroma planes, then the coefficients of planes 1 and 2 */
    m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

    /* the coefficient count is doubled when m_csp is X265_CSP_I422 */
    const uint32_t chromaCoeffCount = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
    const uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

    for (int plane = 1; plane <= 2; plane++)
        memcpy(cu.m_trCoeff[plane] + coeffOffsetC,
               m_rqt[qtLayer].coeffRQT[plane] + coeffOffsetC,
               sizeof(coeff_t) * chromaCoeffCount);
}
6194
6195
/* returns the number of bits required to signal a non-most-probable mode.
6196
 * on return mpms contains bitmap of most probable modes */
6197
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const
6198
0
{
6199
0
    cu.getIntraDirLumaPredictor(absPartIdx, mpmModes);
6200
6201
0
    mpms = 0;
6202
0
    for (int i = 0; i < 3; ++i)
6203
0
        mpms |= ((uint64_t)1 << mpmModes[i]);
6204
6205
0
    return m_entropyCoder.bitsIntraModeNonMPM();
6206
0
}
6207
6208
/* swap the current mode/cost with the mode with the highest cost in the
6209
 * current candidate list, if its cost is better (maintain a top N list) */
6210
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
6211
0
{
6212
0
    uint32_t maxIndex = 0;
6213
0
    uint64_t maxValue = 0;
6214
6215
0
    for (int i = 0; i < maxCandCount; i++)
6216
0
    {
6217
0
        if (maxValue < candCostList[i])
6218
0
        {
6219
0
            maxValue = candCostList[i];
6220
0
            maxIndex = i;
6221
0
        }
6222
0
    }
6223
6224
0
    if (cost < maxValue)
6225
0
    {
6226
0
        candCostList[maxIndex] = cost;
6227
0
        candModeList[maxIndex] = mode;
6228
0
    }
6229
0
}
6230
6231
/* Account for delta-QP signalling in the cost of 'mode'.
 *
 * Nothing is done unless the PPS enables DQP and cuGeom.depth does not exceed
 * maxCuDQPDepth. If the CU's root CBF at partition 0 is zero, its QP is reset to
 * the reference QP. Otherwise the mode's bit count and cost are updated
 * according to m_param->rdLevel:
 *   >= 3  bits measured by coding the delta QP through mode.contexts
 *   <= 1  one extra bit on the sa8d bit count, sa8dCost recomputed
 *   else  one extra bit on totalBits, cost refreshed with updateModeCost() */
void Search::checkDQP(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (!cu.m_slice->m_pps->bUseDQP || cuGeom.depth > cu.m_slice->m_pps->maxCuDQPDepth)
        return;

    if (!cu.getQtRootCbf(0))
    {
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel >= 3)
    {
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
        return;
    }

    if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
        return;
    }

    mode.totalBits++;
    updateModeCost(mode);
}
6261
6262
/* Delta-QP accounting for a split prediction mode. Acts only when the PPS
 * enables DQP and cuGeom.depth equals maxCuDQPDepth.
 *
 * The partitions of the CU are scanned for a set root CBF. If one is found, the
 * delta-QP bits are added to the mode cost (measured through mode.contexts for
 * rdLevel >= 3, one bit on the sa8d cost for rdLevel <= 1, one bit on totalBits
 * otherwise) and cu.setQPSubCUs() is called with the reference QP. If none is
 * found, the QP of the whole CU is reset to the reference QP. */
void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom)
{
    CUData& cu = mode.cu;

    if (cuGeom.depth != cu.m_slice->m_pps->maxCuDQPDepth || !cu.m_slice->m_pps->bUseDQP)
        return;

    /* stop scanning at the first partition whose root CBF is set */
    bool bAnyCoded = false;
    for (uint32_t part = 0; part < cuGeom.numPartitions && !bAnyCoded; part++)
        bAnyCoded = cu.getQtRootCbf(part) != 0;

    if (!bAnyCoded)
    {
        /* No residual within this CU or subCU, so reset QP to RefQP */
        cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        return;
    }

    if (m_param->rdLevel >= 3)
    {
        mode.contexts.resetBits();
        mode.contexts.codeDeltaQP(cu, 0);
        const uint32_t dqpBits = mode.contexts.getNumberOfWrittenBits();
        mode.totalBits += dqpBits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel > 1)
    {
        mode.totalBits++;
        updateModeCost(mode);
    }
    else
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits);
    }

    /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled).
     * When the non-zero CBF sub-CU is found, stop */
    cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
}