Coverage Report

Created: 2026-05-16 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/common/quant.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
 *          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "common.h"
26
#include "primitives.h"
27
#include "quant.h"
28
#include "framedata.h"
29
#include "entropy.h"
30
#include "yuv.h"
31
#include "cudata.h"
32
#include "contexts.h"
33
34
using namespace X265_NS;
35
36
103k
/* Conditionally negates x based on the sign of y: when (y >> 31) is all ones
 * (negative 32-bit y with an arithmetic right shift) the result is -x,
 * otherwise it is x.  Both arguments are fully parenthesized so that
 * expression arguments are not re-associated by operator precedence.
 * NOTE: y is evaluated twice; do not pass expressions with side effects. */
#define SIGN(x,y) (((x)^((y) >> 31))-((y) >> 31))
37
38
namespace {
39
40
/* Rate-distortion bookkeeping fields (the name suggests they are kept per
 * coefficient group). */
struct coeffGroupRDStats
{
    /* indicates coeff other than pos 0 are coded */
    int     nnzBeforePos0;

    /* distortion and level cost of coded coefficients */
    int64_t codedLevelAndDist;

    /* uncoded distortion cost of coded coefficients */
    int64_t uncodedDist;

    /* cost of signaling significant coeff bitmap */
    int64_t sigCost;

    /* cost of signaling sig coeff bit of coeff 0 */
    int64_t sigCost0;
};
48
49
/* Branch-free min(x, y).
 * The sign of (x - y) is smeared across the whole word with a right shift;
 * the resulting mask selects either (x - y) or 0 to add to y.
 * Relies on an arithmetic right shift of negative ints and on (x - y) not
 * overflowing. */
inline int fastMin(int x, int y)
{
    const int delta = x - y;
    const int negMask = delta >> (sizeof(int) * CHAR_BIT - 1); // all ones when x < y
    return y + (delta & negMask);
}
53
54
inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
55
141k
{
56
141k
    X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
57
141k
    if (!absLevel)
58
1.62k
    {
59
1.62k
        X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
60
1.62k
        return 0;
61
1.62k
    }
62
139k
    int rate = 0;
63
64
139k
    if (diffLevel < 0)
65
11.9k
    {
66
11.9k
        X265_CHECK(absLevel <= 2, "absLevel check failure\n");
67
11.9k
        rate += greaterOneBits[(absLevel == 2)];
68
69
11.9k
        if (absLevel == 2)
70
4.22k
            rate += levelAbsBits[0];
71
11.9k
    }
72
127k
    else
73
127k
    {
74
127k
        uint32_t symbol = diffLevel;
75
127k
        bool expGolomb = (symbol > maxVlc);
76
77
127k
        if (expGolomb)
78
108k
        {
79
108k
            absLevel = symbol - maxVlc;
80
81
            // NOTE: mapping to x86 hardware instruction BSR
82
108k
            unsigned long size;
83
108k
            BSR(size, absLevel);
84
108k
            int egs = size * 2 + 1;
85
86
108k
            rate += egs << 15;
87
88
            // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1)
89
108k
            X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n");
90
108k
            symbol = maxVlc + 1;
91
108k
        }
92
93
127k
        uint32_t prefLen = (symbol >> absGoRice) + 1;
94
127k
        uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
95
96
127k
        rate += numBins << 15;
97
127k
        rate += c1c2Rate;
98
127k
    }
99
139k
    return rate;
100
141k
}
101
102
#if CHECKED_BUILD || _DEBUG
/* Rate of a level in the range [0, 2], compiled only for checked/debug builds:
 *   0 -> 0
 *   1 -> greaterOneBits[0]
 *   2 -> greaterOneBits[1] + levelAbsBits[0] */
inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits)
{
    X265_CHECK(absLevel <= 2, "absLevel check failure\n");

    switch (absLevel)
    {
    case 0:
        return 0;
    case 2:
        return greaterOneBits[1] + levelAbsBits[0];
    default:
        return greaterOneBits[0];
    }
}
#endif
117
118
inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice)
119
94.8k
{
120
94.8k
    X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
121
94.8k
    if (!absLevel)
122
0
    {
123
0
        X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
124
0
        return 0;
125
0
    }
126
94.8k
    int rate;
127
128
94.8k
    uint32_t symbol = diffLevel;
129
94.8k
    uint32_t prefLen = (symbol >> absGoRice) + 1;
130
94.8k
    uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
131
132
94.8k
    rate = numBins << 15;
133
134
94.8k
    return rate;
135
94.8k
}
136
137
/* Calculates the cost for specific absolute transform level */
138
inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
139
155k
{
140
155k
    X265_CHECK(absLevel, "absLevel should not be zero\n");
141
142
155k
    if (diffLevel < 0)
143
11.7k
    {
144
11.7k
        X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n");
145
146
11.7k
        uint32_t rate = greaterOneBits[(absLevel == 2)];
147
11.7k
        if (absLevel == 2)
148
5.02k
            rate += levelAbsBits[0];
149
11.7k
        return rate;
150
11.7k
    }
151
144k
    else
152
144k
    {
153
144k
        uint32_t rate;
154
144k
        uint32_t symbol = diffLevel;
155
144k
        if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION)
156
49.5k
        {
157
49.5k
            uint32_t length = symbol >> absGoRice;
158
49.5k
            rate = (length + 1 + absGoRice) << 15;
159
49.5k
        }
160
94.6k
        else
161
94.6k
        {
162
94.6k
            uint32_t length = 0;
163
94.6k
            symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION;
164
94.6k
            if (symbol)
165
86.4k
            {
166
86.4k
                unsigned long idx;
167
86.4k
                BSR(idx, symbol + 1);
168
86.4k
                length = idx;
169
86.4k
            }
170
171
94.6k
            rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
172
94.6k
        }
173
144k
        rate += c1c2Rate;
174
144k
        return rate;
175
144k
    }
176
155k
}
177
178
}
179
180
/* Dispatch table of rdoQuant instantiations; transformNxN() indexes it with
 * (log2TrSize - 2), so entry i handles log2TrSize == i + 2. */
Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] =
{
    &Quant::rdoQuant<2>,
    &Quant::rdoQuant<3>,
    &Quant::rdoQuant<4>,
    &Quant::rdoQuant<5>
};
181
182
/* Leaves every buffer pointer NULL; the buffers are allocated later by
 * init() and allocNoiseReduction(). */
Quant::Quant()
{
    // transform work buffers (allocated in init())
    m_fencShortBuf = NULL;
    m_fencDctCoeff = NULL;
    m_resiDctCoeff = NULL;

    // noise reduction state (allocated in allocNoiseReduction())
    m_nr           = NULL;
    m_frameNr      = NULL;
}
190
191
bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
192
19.6k
{
193
19.6k
    m_entropyCoder = &entropy;
194
19.6k
    m_psyRdoqScale = (int32_t)(psyScale * 256.0);
195
19.6k
    X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
196
19.6k
    m_scalingList  = &scalingList;
197
19.6k
    m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
198
19.6k
    m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
199
19.6k
    m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
200
201
19.6k
    return m_resiDctCoeff && m_fencShortBuf;
202
19.6k
}
203
204
bool Quant::allocNoiseReduction(const x265_param& param)
205
0
{
206
0
    m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads);
207
0
    if (m_frameNr)
208
0
        memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads);
209
0
    else
210
0
        return false;
211
0
    return true;
212
0
}
213
214
/* Releases the buffers owned by this object.  m_fencDctCoeff is not freed on
 * its own because init() points it into the m_resiDctCoeff allocation. */
Quant::~Quant()
{
    X265_FREE(m_fencShortBuf);
    X265_FREE(m_resiDctCoeff);
    X265_FREE(m_frameNr);
}
220
221
void Quant::setQPforQuant(const CUData& ctu, int qp)
222
24.6k
{
223
24.6k
    m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
224
24.6k
    m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
225
24.6k
    m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel;
226
24.6k
    if (ctu.m_chromaFormat != X265_CSP_I400)
227
24.6k
    {
228
24.6k
        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0] + ctu.m_slice->m_chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
229
24.6k
        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1] + ctu.m_slice->m_chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
230
24.6k
    }
231
24.6k
}
232
233
void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
234
49.3k
{
235
49.3k
    int qp = x265_clip3(-QP_BD_OFFSET, 57, qpin);
236
49.3k
    if (qp >= 30)
237
11.4k
    {
238
11.4k
        if (chFmt == X265_CSP_I420)
239
11.4k
            qp = g_chromaScale[qp];
240
0
        else
241
0
            qp = X265_MIN(qp, QP_MAX_SPEC);
242
11.4k
    }
243
49.3k
    m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET);
244
49.3k
}
245
246
/* To minimize the distortion only. No rate is considered */
247
uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize)
248
0
{
249
0
    uint32_t trSize = 1 << log2TrSize;
250
0
    const uint16_t* scan = codeParams.scan;
251
252
0
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
253
0
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
254
0
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
255
256
#if CHECKED_BUILD || _DEBUG
257
    // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
258
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
259
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
260
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
261
#endif
262
0
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
263
0
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
264
0
    unsigned long tmp;
265
266
    // first CG need specially processing
267
0
    const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF);
268
0
    coeffFlag[cgLastScanPos] <<= correctOffset;
269
270
0
    for (int cg = cgLastScanPos; cg >= 0; cg--)
271
0
    {
272
0
        int cgStartPos = cg << LOG2_SCAN_SET_SIZE;
273
0
        int n;
274
275
#if CHECKED_BUILD || _DEBUG
276
        for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
277
            if (coeff[scan[n + cgStartPos]])
278
                break;
279
        int lastNZPosInCG0 = n;
280
#endif
281
282
0
        if (coeffNum[cg] == 0)
283
0
        {
284
0
            X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n");
285
0
            continue;
286
0
        }
287
288
#if CHECKED_BUILD || _DEBUG
289
        for (n = 0;; n++)
290
            if (coeff[scan[n + cgStartPos]])
291
                break;
292
293
        int firstNZPosInCG0 = n;
294
#endif
295
296
0
        BSR(tmp, coeffFlag[cg]);
297
0
        const int firstNZPosInCG = (15 ^ tmp);
298
299
0
        BSF(tmp, coeffFlag[cg]);
300
0
        const int lastNZPosInCG = (15 ^ tmp);
301
302
0
        X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n");
303
0
        X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n");
304
305
0
        if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
306
0
        {
307
0
            uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1;
308
0
            uint32_t absSum = 0;
309
310
0
            for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
311
0
                absSum += coeff[scan[n + cgStartPos]];
312
313
0
            if (signbit != (absSum & 0x1)) // compare signbit with sum_parity
314
0
            {
315
0
                int minCostInc = MAX_INT,  minPos = -1, curCost = MAX_INT;
316
0
                int32_t finalChange = 0, curChange = 0;
317
0
                uint32_t cgFlags = coeffFlag[cg];
318
0
                if (cg == cgLastScanPos)
319
0
                    cgFlags >>= correctOffset;
320
321
0
                for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
322
0
                {
323
0
                    uint32_t blkPos = scan[n + cgStartPos];
324
0
                    X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n");
325
326
0
                    if (cgFlags & 1)
327
0
                    {
328
0
                        if (deltaU[blkPos] > 0)
329
0
                        {
330
0
                            curCost = -deltaU[blkPos];
331
0
                            curChange = 1;
332
0
                        }
333
0
                        else
334
0
                        {
335
0
                            if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1))
336
0
                            {
337
0
                                X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n");
338
0
                                curCost = MAX_INT;
339
0
                            }
340
0
                            else
341
0
                            {
342
0
                                curCost = deltaU[blkPos];
343
0
                                curChange = -1;
344
0
                            }
345
0
                        }
346
0
                    }
347
0
                    else
348
0
                    {
349
0
                        if (cgFlags == 0)
350
0
                        {
351
0
                            X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n");
352
0
                            uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1;
353
0
                            if (thisSignBit != signbit)
354
0
                                curCost = MAX_INT;
355
0
                            else
356
0
                            {
357
0
                                curCost = -deltaU[blkPos];
358
0
                                curChange = 1;
359
0
                            }
360
0
                        }
361
0
                        else
362
0
                        {
363
0
                            curCost = -deltaU[blkPos];
364
0
                            curChange = 1;
365
0
                        }
366
0
                    }
367
368
0
                    if (curCost < minCostInc)
369
0
                    {
370
0
                        minCostInc = curCost;
371
0
                        finalChange = curChange;
372
0
                        minPos = blkPos;
373
0
                    }
374
0
                    cgFlags>>=1;
375
0
                }
376
377
                /* do not allow change to violate coeff clamp */
378
0
                if (coeff[minPos] == 32767 || coeff[minPos] == -32768)
379
0
                    finalChange = -1;
380
381
0
                if (!coeff[minPos])
382
0
                    numSig++;
383
0
                else if (finalChange == -1 && abs(coeff[minPos]) == 1)
384
0
                    numSig--;
385
386
0
                {
387
0
                    const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15;
388
0
                    coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask;
389
0
                }
390
0
            }
391
0
        }
392
0
    }
393
394
0
    return numSig;
395
0
}
396
397
uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
398
                             coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
399
10.3M
{
400
10.3M
    const uint32_t sizeIdx = log2TrSize - 2;
401
402
10.3M
    if (cu.m_tqBypass[0])
403
2.75M
    {
404
2.75M
        X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
405
2.75M
        return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride);
406
2.75M
    }
407
408
7.63M
    bool isLuma  = ttype == TEXT_LUMA;
409
7.63M
    bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
410
7.63M
    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
411
412
7.63M
    X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
413
7.63M
    if (useTransformSkip)
414
0
    {
415
0
#if X265_DEPTH <= 10
416
0
        X265_CHECK(transformShift >= 0, "invalid transformShift\n");
417
0
        primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
418
#else
419
        if (transformShift >= 0)
420
            primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift);
421
        else
422
            primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift);
423
#endif
424
0
    }
425
7.63M
    else
426
7.63M
    {
427
7.63M
        bool isIntra = cu.isIntra(absPartIdx);
428
429
7.63M
        if (!sizeIdx && isLuma && isIntra)
430
2.37M
            primitives.dst4x4(residual, m_resiDctCoeff, resiStride);
431
5.26M
        else
432
5.26M
            primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride);
433
434
        /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
435
         * there is no risk of performing this DCT unnecessarily */
436
7.63M
        if (usePsy)
437
3.12M
        {
438
3.12M
            int trSize = 1 << log2TrSize;
439
            /* perform DCT on source pixels for psy-rdoq */
440
3.12M
            primitives.cu[sizeIdx].copy_ps(m_fencShortBuf, trSize, fenc, fencStride);
441
3.12M
            primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
442
3.12M
        }
443
444
7.63M
        if (m_nr && m_nr->offset)
445
0
        {
446
            /* denoise is not applied to intra residual, so DST can be ignored */
447
0
            int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
448
0
            int numCoeff = 1 << (log2TrSize * 2);
449
0
            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
450
0
            m_nr->count[cat]++;
451
0
        }
452
7.63M
    }
453
454
7.63M
    if (m_rdoqLevel)
455
7.63M
        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
456
18.4E
    else
457
18.4E
    {
458
18.4E
        int deltaU[32 * 32];
459
460
18.4E
        int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
461
18.4E
        int rem = m_qpParam[ttype].rem;
462
18.4E
        int per = m_qpParam[ttype].per;
463
18.4E
        const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
464
465
18.4E
        int qbits = QUANT_SHIFT + per + transformShift;
466
18.4E
        int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
467
18.4E
        int numCoeff = 1 << (log2TrSize * 2);
468
469
18.4E
        uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
470
471
18.4E
        if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled)
472
0
        {
473
0
            TUEntropyCodingParameters codeParams;
474
0
            cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma);
475
0
            return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize);
476
0
        }
477
18.4E
        else
478
18.4E
            return numSig;
479
18.4E
    }
480
7.63M
}
481
482
uint64_t Quant::ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx)
483
0
{
484
0
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416
485
0
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963
486
0
    int shift = (X265_DEPTH - 8);
487
488
0
    int trSize = 1 << log2TrSize;
489
0
    uint64_t ssDc = 0, ssBlock = 0, ssAc = 0;
490
491
    // Calculation of (X(0) - Y(0)) * (X(0) - Y(0)), DC
492
0
    ssDc = 0;
493
0
    for (int y = 0; y < trSize; y += 4)
494
0
    {
495
0
        for (int x = 0; x < trSize; x += 4)
496
0
        {
497
0
            int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff
498
0
            ssDc += temp * temp;
499
0
        }
500
0
    }
501
502
    // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC
503
0
    ssBlock = 0;
504
0
    uint64_t ac_k = 0;
505
0
    primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k);
506
0
    ssAc = ssBlock - ssDc;
507
508
    // 1. Calculation of fdc'
509
    // Calculate numerator of dc normalization factor
510
0
    uint64_t fDc_num = 0;
511
512
    // 2. Calculate dc component
513
0
    uint64_t dc_k = 0;
514
0
    for (int block_yy = 0; block_yy < trSize; block_yy += 4)
515
0
    {
516
0
        for (int block_xx = 0; block_xx < trSize; block_xx += 4)
517
0
        {
518
0
            uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift;
519
0
            dc_k += temp * temp;
520
0
        }
521
0
    }
522
523
0
    fDc_num = (2 * dc_k)  + (trSize * trSize * ssim_c1); // 16 pixels -> for each 4x4 block
524
0
    fDc_num /= ((trSize >> 2) * (trSize >> 2));
525
526
    // 1. Calculation of fac'
527
    // Calculate numerator of ac normalization factor
528
0
    uint64_t fAc_num = 0;
529
530
    // 2. Calculate ac component
531
0
    ac_k -= dc_k;
532
533
0
    double s = 1 + 0.005 * cu.m_qp[absPartIdx];
534
535
0
    fAc_num = ac_k + uint64_t(s * ac_k) + ssim_c2;
536
0
    fAc_num /= ((trSize >> 2) * (trSize >> 2));
537
538
    // Calculate dc and ac normalization factor
539
0
    uint64_t ssim_distortion = ((ssDc * cu.m_fDc_den[ttype]) / fDc_num) + ((ssAc * cu.m_fAc_den[ttype]) / fAc_num);
540
0
    return ssim_distortion;
541
0
}
542
543
void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
544
                            uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
545
45.8k
{
546
45.8k
    const uint32_t sizeIdx = log2TrSize - 2;
547
45.8k
    if (cu.m_tqBypass[0])
548
10.2k
    {
549
10.2k
        primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
550
10.2k
        return;
551
10.2k
    }
552
    // Values need to pass as input parameter in dequant
553
35.5k
    int rem = m_qpParam[ttype].rem;
554
35.5k
    int per = m_qpParam[ttype].per;
555
35.5k
    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
556
35.5k
    int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
557
35.5k
    int numCoeff = 1 << (log2TrSize * 2);
558
559
35.5k
    if (m_scalingList->m_bEnabled)
560
0
    {
561
0
        int scalingListType = (bIntra ? 0 : 3) + ttype;
562
0
        const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
563
0
        primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
564
0
    }
565
35.5k
    else
566
35.5k
    {
567
35.5k
        int scale = m_scalingList->s_invQuantScales[rem] << per;
568
35.5k
        primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift);
569
35.5k
    }
570
571
35.5k
    if (useTransformSkip)
572
0
    {
573
0
#if X265_DEPTH <= 10
574
0
        X265_CHECK(transformShift > 0, "invalid transformShift\n");
575
0
        primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
576
#else
577
        if (transformShift > 0)
578
            primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
579
        else
580
            primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
581
#endif
582
0
    }
583
35.5k
    else
584
35.5k
    {
585
35.5k
        int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
586
35.5k
        X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n");
587
        // DC only
588
35.5k
        if (numSig == 1 && coeff[0] != 0 && !useDST)
589
30.0k
        {
590
30.0k
            const int shift_1st = 7 - 6;
591
30.0k
            const int add_1st = 1 << (shift_1st - 1);
592
30.0k
            const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
593
30.0k
            const int add_2nd = 1 << (shift_2nd - 1);
594
595
30.0k
            int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
596
30.0k
            primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val);
597
30.0k
            return;
598
30.0k
        }
599
600
5.46k
        if (useDST)
601
5.46k
            primitives.idst4x4(m_resiDctCoeff, residual, resiStride);
602
0
        else
603
0
            primitives.cu[sizeIdx].idct(m_resiDctCoeff, residual, resiStride);
604
5.46k
    }
605
35.5k
}
606
607
/* Rate distortion optimized quantization for entropy coding engines using
608
 * probability models like CABAC */
609
template<uint32_t log2TrSize>
610
uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
611
7.63M
{
612
7.63M
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613
7.63M
    int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614
7.63M
    const uint32_t usePsyMask = usePsy ? -1 : 0;
615
616
7.63M
    X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617
618
7.63M
    int rem = m_qpParam[ttype].rem;
619
7.63M
    int per = m_qpParam[ttype].per;
620
7.63M
    int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621
7.63M
    int add = (1 << (qbits - 1));
622
7.63M
    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623
624
7.63M
    const int numCoeff = 1 << (log2TrSize * 2);
625
7.63M
    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626
7.63M
    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627
7.63M
    if (!numSig)
628
7.58M
        return 0;
629
49.4k
    const uint32_t trSize = 1 << log2TrSize;
630
49.4k
    int64_t lambda2 = m_qpParam[ttype].lambda2;
631
49.4k
    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632
    /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633
     * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634
     * at several stages. We skip the clipping for simplicity when measuring RD cost */
635
49.4k
    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636
49.4k
    int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637
49.4k
    int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638
49.4k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
639
640
49.4k
#define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641
468k
#define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642
185k
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643
381k
#define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644
645
49.4k
    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646
49.4k
    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647
49.4k
    int64_t costSig[trSize * trSize];     /* lambda * bits       */
648
649
49.4k
    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650
49.4k
    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651
49.4k
    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652
653
49.4k
    int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654
49.4k
    uint64_t sigCoeffGroupFlag64 = 0;
655
656
49.4k
    const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657
49.4k
    bool bIsLuma = ttype == TEXT_LUMA;
658
659
    /* total rate distortion cost of transform block, as CBF=0 */
660
49.4k
    int64_t totalUncodedCost = 0;
661
662
    /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
663
     * the distortion and signal cost of coded blocks, and the coding cost of significant
664
     * coefficient and coefficient group bitmaps */
665
49.4k
    int64_t totalRdCost = 0;
666
667
49.4k
    TUEntropyCodingParameters codeParams;
668
49.4k
    cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669
49.4k
    const uint32_t log2TrSizeCG = log2TrSize - 2;
670
49.4k
    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671
49.4k
    const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672
673
49.4k
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674
49.4k
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675
49.4k
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676
677
#if CHECKED_BUILD || _DEBUG
678
    // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
679
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
680
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
681
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
682
#endif
683
49.4k
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684
49.4k
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685
686
687
    /* TODO: update bit estimates if dirty */
688
49.4k
    EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689
690
49.4k
    uint32_t scanPos = 0;
691
49.4k
    uint32_t c1 = 1;
692
693
    // process trail all zero Coeff Group
694
695
    /* coefficients after lastNZ have no distortion signal cost */
696
49.4k
    const int zeroCG = cgNum - 1 - cgLastScanPos;
697
49.4k
    memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698
49.4k
    memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699
700
    /* sum zero coeff (uncodec) cost */
701
702
    // TODO: does we need these cost?
703
49.4k
    if (usePsyMask)
704
18.5k
    {
705
264k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706
246k
        {
707
246k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708
246k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709
246k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
710
246k
#if X265_ARCH_X86
711
246k
            bool enable512 = detect512();
712
246k
            if (enable512)
713
0
                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714
246k
            else
715
246k
            {
716
246k
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717
246k
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718
246k
            }
719
#else
720
            primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721
            primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722
#endif
723
246k
        }
724
18.5k
    }
725
30.8k
    else
726
30.8k
    {
727
        // non-psy path
728
140k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729
109k
        {
730
109k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731
109k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732
109k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
733
109k
            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734
109k
        }
735
30.8k
    }
736
49.4k
    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737
49.4k
    {
738
        // patternSigCtx = 0
739
49.4k
        {
740
49.4k
            2, 1, 1, 0,
741
49.4k
            1, 1, 0, 0,
742
49.4k
            1, 0, 0, 0,
743
49.4k
            0, 0, 0, 0,
744
49.4k
        },
745
        // patternSigCtx = 1
746
49.4k
        {
747
49.4k
            2, 2, 2, 2,
748
49.4k
            1, 1, 1, 1,
749
49.4k
            0, 0, 0, 0,
750
49.4k
            0, 0, 0, 0,
751
49.4k
        },
752
        // patternSigCtx = 2
753
49.4k
        {
754
49.4k
            2, 1, 0, 0,
755
49.4k
            2, 1, 0, 0,
756
49.4k
            2, 1, 0, 0,
757
49.4k
            2, 1, 0, 0,
758
49.4k
        },
759
        // patternSigCtx = 3
760
49.4k
        {
761
49.4k
            2, 2, 2, 2,
762
49.4k
            2, 2, 2, 2,
763
49.4k
            2, 2, 2, 2,
764
49.4k
            2, 2, 2, 2,
765
49.4k
        },
766
        // 4x4
767
49.4k
        {
768
49.4k
            0, 1, 4, 5,
769
49.4k
            2, 3, 4, 5,
770
49.4k
            6, 6, 8, 8,
771
49.4k
            7, 7, 8, 8
772
49.4k
        }
773
49.4k
    };
774
775
    /* iterate over coding groups in reverse scan order */
776
99.7k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777
50.3k
    {
778
50.3k
        uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779
50.3k
        const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780
50.3k
        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781
50.3k
        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782
50.3k
        const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783
50.3k
        const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784
50.3k
        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785
786
50.3k
        if (c1 == 0)
787
0
            ctxSet++;
788
50.3k
        c1 = 1;
789
790
50.3k
        if (cgScanPos && (coeffNum[cgScanPos] == 0))
791
0
        {
792
            // TODO: do we need the zero-coeff cost?
793
0
            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794
0
            uint32_t blkPos = codeParams.scan[scanPosBase];
795
0
            if (usePsyMask)
796
0
            {
797
0
#if X265_ARCH_X86
798
0
                bool enable512 = detect512();
799
0
                if (enable512)
800
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801
0
                else
802
0
                {
803
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805
0
                }
806
#else
807
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809
#endif
810
0
                blkPos = codeParams.scan[scanPosBase];
811
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
812
0
                {
813
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
814
0
                    {
815
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
818
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819
820
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823
0
                    }
824
0
                    blkPos += trSize;
825
0
                }
826
0
            }
827
0
            else
828
0
            {
829
                // non-psy path
830
0
                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831
0
                blkPos = codeParams.scan[scanPosBase];
832
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
833
0
                {
834
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
835
0
                    {
836
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
839
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840
841
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844
0
                    }
845
0
                    blkPos += trSize;
846
0
                }
847
0
            }
848
849
            /* there were no coded coefficients in this coefficient group */
850
0
            {
851
0
                uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853
0
                totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854
0
            }
855
0
            continue;
856
0
        }
857
858
50.3k
        coeffGroupRDStats cgRdStats;
859
50.3k
        memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860
861
50.3k
        uint32_t subFlagMask = coeffFlag[cgScanPos];
862
50.3k
        int    c2            = 0;
863
50.3k
        uint32_t goRiceParam = 0;
864
50.3k
        uint32_t levelThreshold = 3;
865
50.3k
        uint32_t c1Idx       = 0;
866
50.3k
        uint32_t c2Idx       = 0;
867
        /* iterate over coefficients in each group in reverse scan order */
868
855k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869
804k
        {
870
804k
            scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871
804k
            uint32_t blkPos      = codeParams.scan[scanPos];
872
804k
            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873
804k
            int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874
804k
            int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875
876
            /* RDOQ measures distortion as the squared difference between the unquantized coded level
877
             * and the original DCT coefficient. The result is shifted scaleBits to account for the
878
             * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879
880
            /* cost of not coding this coefficient (all distortion, no signal bits) */
881
804k
            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882
804k
            X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883
804k
            if (usePsyMask & scanPos)
884
                /* when no residual coefficient is coded, predicted coef == recon coef */
885
278k
                costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886
887
804k
            totalUncodedCost += costUncoded[blkPos];
888
889
            // coefficient level estimation
890
804k
            const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891
            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892
804k
            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893
804k
            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894
804k
            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
895
            // NOTE: the above is equivalent to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896
804k
            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897
898
            // before the last non-zero coeff has been found (reverse scan order)
899
804k
            if (scanPos > (uint32_t)lastScanPos)
900
695k
            {
901
                /* coefficients after lastNZ have no distortion signal cost */
902
695k
                costCoeff[scanPos] = 0;
903
695k
                costSig[scanPos] = 0;
904
905
                /* No non-zero coefficient yet found, but this does not mean
906
                 * there is no uncoded-cost for this coefficient. Pre-
907
                 * quantization the coefficient may have been non-zero */
908
695k
                totalRdCost += costUncoded[blkPos];
909
695k
            }
910
109k
            else if (!(subFlagMask & 1))
911
1.80k
            {
912
                // fast zero coeff path
913
                /* set default costs to uncoded costs */
914
1.80k
                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915
1.80k
                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916
1.80k
                sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917
1.80k
                totalRdCost += costCoeff[scanPos];
918
1.80k
                rateIncUp[blkPos] = greaterOneBits[0];
919
920
1.80k
                subFlagMask >>= 1;
921
1.80k
            }
922
107k
            else
923
107k
            {
924
107k
                subFlagMask >>= 1;
925
926
107k
                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927
107k
                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928
929
107k
                X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930
107k
                X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931
107k
                X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932
107k
                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933
934
                // coefficient level estimation
935
107k
                const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936
107k
                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937
938
107k
                uint32_t level = 0;
939
107k
                uint32_t sigCoefBits = 0;
940
107k
                costCoeff[scanPos] = MAX_INT64;
941
942
107k
                if ((int)scanPos == lastScanPos)
943
50.3k
                    sigRateDelta[blkPos] = 0;
944
56.9k
                else
945
56.9k
                {
946
56.9k
                    if (maxAbsLevel < 3)
947
14.0k
                    {
948
                        /* set default costs to uncoded costs */
949
14.0k
                        costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950
14.0k
                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951
14.0k
                    }
952
56.9k
                    sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953
56.9k
                    sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954
56.9k
                }
955
956
107k
                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957
                // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958
107k
                if (maxAbsLevel == 1)
959
29.4k
                {
960
29.4k
                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961
29.4k
                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962
963
29.4k
                    int unquantAbsLevel = unQuantLevel >> unquantShift;
964
29.4k
                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965
29.4k
                    int d = abs(signCoef) - unquantAbsLevel;
966
29.4k
                    int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967
968
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969
29.4k
                    if (usePsyMask & scanPos)
970
11.0k
                    {
971
11.0k
                        int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972
11.0k
                        curCost -= PSYVALUE(reconCoef);
973
11.0k
                    }
974
975
29.4k
                    if (curCost < costCoeff[scanPos])
976
28.7k
                    {
977
28.7k
                        level = 1;
978
28.7k
                        costCoeff[scanPos] = curCost;
979
28.7k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
980
28.7k
                    }
981
29.4k
                }
982
77.8k
                else if (maxAbsLevel)
983
77.9k
                {
984
77.9k
                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985
77.9k
                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986
987
77.9k
                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988
989
77.9k
                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990
77.9k
                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991
77.9k
                    int d0 = abs(signCoef) - unquantAbsLevel0;
992
77.9k
                    int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993
994
77.9k
                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995
77.9k
                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996
77.9k
                    int d1 = abs(signCoef) - unquantAbsLevel1;
997
77.9k
                    int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998
999
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000
77.9k
                    if (usePsyMask & scanPos)
1001
46.1k
                    {
1002
46.1k
                        int reconCoef;
1003
46.1k
                        reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004
46.1k
                        curCost0 -= PSYVALUE(reconCoef);
1005
1006
46.1k
                        reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007
46.1k
                        curCost1 -= PSYVALUE(reconCoef);
1008
46.1k
                    }
1009
77.9k
                    if (curCost0 < costCoeff[scanPos])
1010
77.9k
                    {
1011
77.9k
                        level = maxAbsLevel;
1012
77.9k
                        costCoeff[scanPos] = curCost0;
1013
77.9k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1014
77.9k
                    }
1015
77.9k
                    if (curCost1 < costCoeff[scanPos])
1016
1.56k
                    {
1017
1.56k
                        level = maxAbsLevel - 1;
1018
1.56k
                        costCoeff[scanPos] = curCost1;
1019
1.56k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1020
1.56k
                    }
1021
77.9k
                }
1022
1023
107k
                dstCoeff[blkPos] = (int16_t)level;
1024
107k
                totalRdCost += costCoeff[scanPos];
1025
1026
                /* record costs for sign-hiding performed at the end */
1027
18.4E
                if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028
106k
                {
1029
106k
                    const int32_t diff0 = level - 1 - baseLevel;
1030
106k
                    const int32_t diff2 = level + 1 - baseLevel;
1031
106k
                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032
106k
                    int rate0, rate1, rate2;
1033
1034
106k
                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035
28.0k
                    {
1036
                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037
                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038
28.0k
                        X265_CHECK(level == 1, "absLevel check failure\n");
1039
1040
28.0k
                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041
28.0k
                        const int rateNotEqual2 = greaterOneBits[0];
1042
1043
28.0k
                        rate0 = 0;
1044
28.0k
                        rate2 = rateEqual2;
1045
28.0k
                        rate1 = rateNotEqual2;
1046
1047
28.0k
                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048
28.0k
                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049
28.0k
                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050
28.0k
                    }
1051
78.6k
                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052
31.6k
                    {
1053
                        // NOTE: no c1c2 correct rate since all of rate include this factor
1054
31.6k
                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055
31.6k
                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056
31.6k
                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057
31.6k
                    }
1058
47.0k
                    else
1059
47.0k
                    {
1060
47.0k
                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061
47.0k
                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062
47.0k
                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063
47.0k
                    }
1064
106k
                    rateIncUp[blkPos] = rate2 - rate1;
1065
106k
                    rateIncDown[blkPos] = rate0 - rate1;
1066
106k
                }
1067
584
                else
1068
584
                {
1069
584
                    rateIncUp[blkPos] = greaterOneBits[0];
1070
584
                    rateIncDown[blkPos] = 0;
1071
584
                }
1072
1073
                /* Update CABAC estimation state */
1074
107k
                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075
42.5k
                {
1076
42.5k
                    goRiceParam++;
1077
42.5k
                    levelThreshold <<= 1;
1078
42.5k
                }
1079
1080
107k
                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081
107k
                c1Idx += isNonZero;
1082
1083
                /* update bin model */
1084
107k
                if (level > 1)
1085
77.0k
                {
1086
77.0k
                    c1 = 0;
1087
77.0k
                    c2 += (uint32_t)(c2 - 2) >> 31;
1088
77.0k
                    c2Idx++;
1089
77.0k
                }
1090
30.2k
                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091
24.7k
                    c1++;
1092
1093
107k
                if (dstCoeff[blkPos])
1094
106k
                {
1095
106k
                    sigCoeffGroupFlag64 |= cgBlkPosMask;
1096
106k
                    cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097
106k
                    cgRdStats.uncodedDist += costUncoded[blkPos];
1098
106k
                    cgRdStats.nnzBeforePos0 += scanPosinCG;
1099
106k
                }
1100
107k
            }
1101
1102
804k
            cgRdStats.sigCost += costSig[scanPos];
1103
804k
        } /* end for (scanPosinCG) */
1104
1105
50.3k
        X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106
50.3k
        cgRdStats.sigCost0 = costSig[scanPos];
1107
1108
50.3k
        costCoeffGroupSig[cgScanPos] = 0;
1109
1110
        /* nothing to do at this case */
1111
50.3k
        X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112
1113
50.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1114
50.3k
        {
1115
            /* coeff group 0 is implied to be present, no signal cost */
1116
            /* coeff group with last NZ is implied to be present, handled below */
1117
50.3k
        }
1118
0
        else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119
0
        {
1120
0
            if (!cgRdStats.nnzBeforePos0)
1121
0
            {
1122
                /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123
0
                totalRdCost -= cgRdStats.sigCost0;
1124
0
                cgRdStats.sigCost -= cgRdStats.sigCost0;
1125
0
            }
1126
1127
            /* there are coded coefficients in this group, but now we include the signaling cost
1128
             * of the significant coefficient group flag and evaluate whether the RD cost of the
1129
             * coded group is more than the RD cost of the uncoded group */
1130
1131
0
            uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132
1133
0
            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134
0
            costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135
0
            costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136
0
            costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137
1138
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140
1141
0
            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142
0
            {
1143
0
                sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144
0
                totalRdCost = costZeroCG;
1145
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146
1147
                /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148
0
                const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149
0
                memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150
0
                memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151
0
                memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152
0
                memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153
0
            }
1154
0
        }
1155
0
        else
1156
0
        {
1157
            /* there were no coded coefficients in this coefficient group */
1158
0
            uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161
0
            totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162
0
        }
1163
50.3k
    } /* end for (cgScanPos) */
1164
1165
49.4k
    X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166
1167
    /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168
49.4k
    int64_t bestCost;
1169
49.4k
    if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170
0
    {
1171
0
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172
0
        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173
0
    }
1174
49.4k
    else
1175
49.4k
    {
1176
49.4k
        int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177
49.4k
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178
49.4k
        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179
49.4k
    }
1180
1181
    /* This loop starts with the last non-zero found in the first loop and then refines this last
1182
     * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183
     * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184
     * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185
     * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186
49.4k
    int  bestLastIdx = 0;
1187
49.4k
    bool foundLast = false;
1188
99.7k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189
50.3k
    {
1190
50.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1191
50.3k
        {
1192
            /* the presence of these coefficient groups are inferred, they have no bit in
1193
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194
50.3k
        }
1195
0
        else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196
0
        {
1197
            /* remove cost of significant coeff group flag, the group's presence would be inferred
1198
             * from lastNZ if it were present in this group */
1199
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1200
0
        }
1201
0
        else
1202
0
        {
1203
            /* remove cost of signaling this empty group as not present */
1204
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1205
0
            continue;
1206
0
        }
1207
1208
775k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209
757k
        {
1210
757k
            scanPos = cgScanPos * cgSize + scanPosinCG;
1211
757k
            if ((int)scanPos > lastScanPos)
1212
695k
                continue;
1213
1214
            /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215
             * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216
             * cost of signaling it as not-significant */
1217
61.4k
            uint32_t blkPos = codeParams.scan[scanPos];
1218
61.4k
            if (dstCoeff[blkPos])
1219
59.8k
            {
1220
                // Calculates the cost of signaling the last significant coefficient in the block 
1221
59.8k
                uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222
59.8k
                if (codeParams.scanType == SCAN_VER)
1223
3.90k
                    std::swap(pos[0], pos[1]);
1224
59.8k
                uint32_t bitsLastNZ = 0;
1225
1226
179k
                for (int i = 0; i < 2; i++)
1227
119k
                {
1228
119k
                    int temp = g_lastCoeffTable[pos[i]];
1229
119k
                    int prefixOnes = temp & 15;
1230
119k
                    int suffixLen = temp >> 4;
1231
1232
119k
                    bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233
119k
                    bitsLastNZ += IEP_RATE * suffixLen;
1234
119k
                }
1235
1236
59.8k
                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237
1238
59.8k
                if (costAsLast < bestCost)
1239
36.8k
                {
1240
36.8k
                    bestLastIdx = scanPos + 1;
1241
36.8k
                    bestCost = costAsLast;
1242
36.8k
                }
1243
59.8k
                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244
31.7k
                {
1245
31.7k
                    foundLast = true;
1246
31.7k
                    break;
1247
31.7k
                }
1248
1249
28.0k
                totalRdCost -= costCoeff[scanPos];
1250
28.0k
                totalRdCost += costUncoded[blkPos];
1251
28.0k
            }
1252
1.64k
            else
1253
1.64k
                totalRdCost -= costSig[scanPos];
1254
61.4k
        }
1255
50.3k
    }
1256
1257
    /* recount non-zero coefficients and re-apply sign of DCT coef */
1258
49.4k
    numSig = 0;
1259
141k
    for (int pos = 0; pos < bestLastIdx; pos++)
1260
92.1k
    {
1261
92.1k
        int blkPos = codeParams.scan[pos];
1262
92.1k
        int level  = dstCoeff[blkPos];
1263
92.1k
        numSig += (level != 0);
1264
1265
92.1k
        uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266
92.1k
        dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267
92.1k
    }
1268
1269
    // Average 49.62 pixels
1270
    /* clean uncoded coefficients */
1271
49.4k
    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272
762k
    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273
712k
    {
1274
712k
        dstCoeff[codeParams.scan[pos]] = 0;
1275
712k
    }
1276
49.4k
    for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277
0
    {
1278
0
        const uint32_t blkPos = codeParams.scan[pos];
1279
0
        memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280
0
        memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281
0
        memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282
0
        memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283
0
    }
1284
1285
    /* rate-distortion based sign-hiding */
1286
50.3k
    if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287
4.60k
    {
1288
4.60k
        const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289
4.60k
        int lastCG = 1;
1290
1291
9.21k
        for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292
4.60k
        {
1293
4.60k
            int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294
4.60k
            int n;
1295
1296
4.60k
            if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297
0
                continue;
1298
1299
            /* measure distance between first and last non-zero coef in this
1300
             * coding group */
1301
4.60k
            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302
4.60k
            const int firstNZPosInCG = (uint8_t)posFirstLast;
1303
4.60k
            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304
4.60k
            const uint32_t absSumSign = posFirstLast;
1305
1306
4.60k
            if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307
4.45k
            {
1308
4.45k
                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309
1310
#if CHECKED_BUILD || _DEBUG
1311
                int32_t absSum_dummy = 0;
1312
                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313
                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314
                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315
#endif
1316
1317
                //if (signbit != absSumSign)
1318
4.45k
                if (((int32_t)(signbit ^ absSumSign)) < 0)
1319
2.08k
                {
1320
                    /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321
                     * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322
                     * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323
1324
2.08k
                    int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325
2.08k
                    uint32_t minPos = 0;
1326
2.08k
                    int8_t finalChange = 0;
1327
2.08k
                    int curChange = 0;
1328
2.08k
                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329
1330
29.1k
                    for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331
27.1k
                    {
1332
27.1k
                        const uint32_t blkPos = codeParams.scan[n + subPos];
1333
27.1k
                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334
27.1k
                        const int absLevel = abs(dstCoeff[blkPos]);
1335
                        // TODO: this is constant in non-scaling mode
1336
27.1k
                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337
27.1k
                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338
1339
27.1k
                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340
27.1k
                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341
1342
27.1k
                        const int64_t origDist = (((int64_t)d * d));
1343
1344
53.1k
#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345
1346
27.1k
                        const uint32_t isOne = (absLevel == 1);
1347
27.1k
                        if (dstCoeff[blkPos])
1348
26.0k
                        {
1349
26.0k
                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350
26.0k
                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351
26.0k
                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352
1353
                            /* if decrementing would make the coeff 0, we can include the
1354
                             * significant coeff flag cost savings */
1355
26.0k
                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356
26.0k
                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357
26.0k
                            int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358
26.0k
                            int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359
1360
26.0k
                            costDown -= lastCoeffAdjust;
1361
26.0k
                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362
1363
26.0k
                            curChange = 2 * (costUp < costDown) - 1;
1364
26.0k
                            curCost = (costUp < costDown) ? costUp : curCost;
1365
26.0k
                        }
1366
                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367
1.01k
                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368
0
                        {
1369
                            /* don't try to make a new coded coeff before the first coeff if its
1370
                             * sign would be different than the first coeff, the inferred sign would
1371
                             * still be wrong and we'd have to do this again. */
1372
0
                            curCost = MAX_INT64;
1373
0
                        }
1374
1.01k
                        else
1375
1.01k
                        {
1376
                            /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377
1.01k
                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378
1.01k
                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379
1.01k
                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380
1.01k
                            curChange = 1;
1381
1.01k
                        }
1382
1383
27.1k
                        if (curCost < minCostInc)
1384
7.37k
                        {
1385
7.37k
                            minCostInc = curCost;
1386
7.37k
                            finalChange = (int8_t)curChange;
1387
7.37k
                            minPos = blkPos + (absLevel << 16);
1388
7.37k
                        }
1389
27.1k
                        lastCoeffAdjust = 0;
1390
27.1k
                    }
1391
1392
2.08k
                    const int absInMinPos = (minPos >> 16);
1393
2.08k
                    minPos = (uint16_t)minPos;
1394
1395
                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396
2.08k
                    if (absInMinPos >= 32767)
1397
                        /* don't allow sign hiding to violate the SPEC range */
1398
0
                        finalChange = -1;
1399
1400
                    // NOTE: Reference code
1401
                    //if (dstCoeff[minPos] == 0)
1402
                    //    numSig++;
1403
                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404
                    //    numSig--;
1405
2.08k
                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406
1407
1408
                    // NOTE: Reference code
1409
                    //if (m_resiDctCoeff[minPos] >= 0)
1410
                    //    dstCoeff[minPos] += finalChange;
1411
                    //else
1412
                    //    dstCoeff[minPos] -= finalChange;
1413
2.08k
                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414
2.08k
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415
2.08k
                }
1416
4.45k
            }
1417
1418
4.60k
            lastCG = 0;
1419
4.60k
        }
1420
4.60k
    }
1421
1422
49.4k
    return numSig;
1423
7.63M
}
unsigned int x265::Quant::rdoQuant<2u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool)
Line
Count
Source
611
6.32M
{
612
6.32M
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613
6.32M
    int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614
6.32M
    const uint32_t usePsyMask = usePsy ? -1 : 0;
615
616
6.32M
    X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617
618
6.32M
    int rem = m_qpParam[ttype].rem;
619
6.32M
    int per = m_qpParam[ttype].per;
620
6.32M
    int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621
6.32M
    int add = (1 << (qbits - 1));
622
6.32M
    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623
624
6.32M
    const int numCoeff = 1 << (log2TrSize * 2);
625
6.32M
    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626
6.32M
    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627
6.32M
    if (!numSig)
628
6.30M
        return 0;
629
14.5k
    const uint32_t trSize = 1 << log2TrSize;
630
14.5k
    int64_t lambda2 = m_qpParam[ttype].lambda2;
631
14.5k
    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632
    /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633
     * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634
     * at several stages. We skip the clipping for simplicity when measuring RD cost */
635
14.5k
    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636
14.5k
    int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637
14.5k
    int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638
14.5k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
639
640
14.5k
#define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641
14.5k
#define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642
14.5k
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643
14.5k
#define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644
645
14.5k
    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646
14.5k
    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647
14.5k
    int64_t costSig[trSize * trSize];     /* lambda * bits       */
648
649
14.5k
    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650
14.5k
    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651
14.5k
    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652
653
14.5k
    int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654
14.5k
    uint64_t sigCoeffGroupFlag64 = 0;
655
656
14.5k
    const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657
14.5k
    bool bIsLuma = ttype == TEXT_LUMA;
658
659
    /* total rate distortion cost of transform block, as CBF=0 */
660
14.5k
    int64_t totalUncodedCost = 0;
661
662
    /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
663
     * the distortion and signal cost of coded blocks, and the coding cost of significant
664
     * coefficient and coefficient group bitmaps */
665
14.5k
    int64_t totalRdCost = 0;
666
667
14.5k
    TUEntropyCodingParameters codeParams;
668
14.5k
    cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669
14.5k
    const uint32_t log2TrSizeCG = log2TrSize - 2;
670
14.5k
    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671
14.5k
    const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672
673
14.5k
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674
14.5k
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675
14.5k
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676
677
#if CHECKED_BUILD || _DEBUG
678
    // clear the output buffers; the asm version of scanPosLast never writes anything after the last non-zero coeff group
679
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
680
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
681
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
682
#endif
683
14.5k
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684
14.5k
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685
686
687
    /* TODO: update bit estimates if dirty */
688
14.5k
    EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689
690
14.5k
    uint32_t scanPos = 0;
691
14.5k
    uint32_t c1 = 1;
692
693
    // process the trailing all-zero coeff groups
694
695
    /* coefficients after lastNZ have no distortion signal cost */
696
14.5k
    const int zeroCG = cgNum - 1 - cgLastScanPos;
697
14.5k
    memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698
14.5k
    memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699
700
    /* sum zero coeff (uncoded) cost */
701
702
    // TODO: do we need these costs?
703
14.5k
    if (usePsyMask)
704
5.67k
    {
705
5.67k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706
0
        {
707
0
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708
0
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709
0
            uint32_t blkPos      = codeParams.scan[scanPosBase];
710
0
#if X265_ARCH_X86
711
0
            bool enable512 = detect512();
712
0
            if (enable512)
713
0
                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714
0
            else
715
0
            {
716
0
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717
0
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718
0
            }
719
#else
720
            primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721
            primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722
#endif
723
0
        }
724
5.67k
    }
725
8.89k
    else
726
8.89k
    {
727
        // non-psy path
728
8.89k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729
0
        {
730
0
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731
0
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732
0
            uint32_t blkPos      = codeParams.scan[scanPosBase];
733
0
            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734
0
        }
735
8.89k
    }
736
14.5k
    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737
14.5k
    {
738
        // patternSigCtx = 0
739
14.5k
        {
740
14.5k
            2, 1, 1, 0,
741
14.5k
            1, 1, 0, 0,
742
14.5k
            1, 0, 0, 0,
743
14.5k
            0, 0, 0, 0,
744
14.5k
        },
745
        // patternSigCtx = 1
746
14.5k
        {
747
14.5k
            2, 2, 2, 2,
748
14.5k
            1, 1, 1, 1,
749
14.5k
            0, 0, 0, 0,
750
14.5k
            0, 0, 0, 0,
751
14.5k
        },
752
        // patternSigCtx = 2
753
14.5k
        {
754
14.5k
            2, 1, 0, 0,
755
14.5k
            2, 1, 0, 0,
756
14.5k
            2, 1, 0, 0,
757
14.5k
            2, 1, 0, 0,
758
14.5k
        },
759
        // patternSigCtx = 3
760
14.5k
        {
761
14.5k
            2, 2, 2, 2,
762
14.5k
            2, 2, 2, 2,
763
14.5k
            2, 2, 2, 2,
764
14.5k
            2, 2, 2, 2,
765
14.5k
        },
766
        // 4x4
767
14.5k
        {
768
14.5k
            0, 1, 4, 5,
769
14.5k
            2, 3, 4, 5,
770
14.5k
            6, 6, 8, 8,
771
14.5k
            7, 7, 8, 8
772
14.5k
        }
773
14.5k
    };
774
775
    /* iterate over coding groups in reverse scan order */
776
29.9k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777
15.3k
    {
778
15.3k
        uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779
15.3k
        const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780
15.3k
        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781
15.3k
        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782
15.3k
        const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783
15.3k
        const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784
15.3k
        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785
786
15.3k
        if (c1 == 0)
787
0
            ctxSet++;
788
15.3k
        c1 = 1;
789
790
15.3k
        if (cgScanPos && (coeffNum[cgScanPos] == 0))
791
0
        {
792
            // TODO: do we need the zero-coeff cost?
793
0
            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794
0
            uint32_t blkPos = codeParams.scan[scanPosBase];
795
0
            if (usePsyMask)
796
0
            {
797
0
#if X265_ARCH_X86
798
0
                bool enable512 = detect512();
799
0
                if (enable512)
800
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801
0
                else
802
0
                {
803
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805
0
                }
806
#else
807
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809
#endif
810
0
                blkPos = codeParams.scan[scanPosBase];
811
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
812
0
                {
813
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
814
0
                    {
815
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
818
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819
820
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823
0
                    }
824
0
                    blkPos += trSize;
825
0
                }
826
0
            }
827
0
            else
828
0
            {
829
                // non-psy path
830
0
                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831
0
                blkPos = codeParams.scan[scanPosBase];
832
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
833
0
                {
834
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
835
0
                    {
836
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
839
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840
841
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844
0
                    }
845
0
                    blkPos += trSize;
846
0
                }
847
0
            }
848
849
            /* there were no coded coefficients in this coefficient group */
850
0
            {
851
0
                uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853
0
                totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854
0
            }
855
0
            continue;
856
0
        }
857
858
15.3k
        coeffGroupRDStats cgRdStats;
859
15.3k
        memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860
861
15.3k
        uint32_t subFlagMask = coeffFlag[cgScanPos];
862
15.3k
        int    c2            = 0;
863
15.3k
        uint32_t goRiceParam = 0;
864
15.3k
        uint32_t levelThreshold = 3;
865
15.3k
        uint32_t c1Idx       = 0;
866
15.3k
        uint32_t c2Idx       = 0;
867
        /* iterate over coefficients in each group in reverse scan order */
868
260k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869
245k
        {
870
245k
            scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871
245k
            uint32_t blkPos      = codeParams.scan[scanPos];
872
245k
            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873
245k
            int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874
245k
            int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875
876
            /* RDOQ measures distortion as the squared difference between the unquantized coded level
877
             * and the original DCT coefficient. The result is shifted scaleBits to account for the
878
             * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879
880
            /* cost of not coding this coefficient (all distortion, no signal bits) */
881
245k
            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882
245k
            X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883
245k
            if (usePsyMask & scanPos)
884
                /* when no residual coefficient is coded, predicted coef == recon coef */
885
85.1k
                costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886
887
245k
            totalUncodedCost += costUncoded[blkPos];
888
889
            // coefficient level estimation
890
245k
            const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891
            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892
245k
            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893
245k
            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894
245k
            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
895
            // NOTE: above equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896
245k
            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897
898
            // last non-zero coeff not yet reached (coefficients are visited in reverse scan order)
899
245k
            if (scanPos > (uint32_t)lastScanPos)
900
171k
            {
901
                /* coefficients after lastNZ have no distortion signal cost */
902
171k
                costCoeff[scanPos] = 0;
903
171k
                costSig[scanPos] = 0;
904
905
                /* No non-zero coefficient yet found, but this does not mean
906
                 * there is no uncoded-cost for this coefficient. Pre-
907
                 * quantization the coefficient may have been non-zero */
908
171k
                totalRdCost += costUncoded[blkPos];
909
171k
            }
910
74.2k
            else if (!(subFlagMask & 1))
911
1.80k
            {
912
                // fast zero coeff path
913
                /* set default costs to uncoded costs */
914
1.80k
                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915
1.80k
                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916
1.80k
                sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917
1.80k
                totalRdCost += costCoeff[scanPos];
918
1.80k
                rateIncUp[blkPos] = greaterOneBits[0];
919
920
1.80k
                subFlagMask >>= 1;
921
1.80k
            }
922
72.4k
            else
923
72.4k
            {
924
72.4k
                subFlagMask >>= 1;
925
926
72.4k
                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927
72.4k
                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928
929
72.4k
                X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930
72.4k
                X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931
72.4k
                X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932
72.4k
                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933
934
                // coefficient level estimation
935
72.4k
                const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936
72.4k
                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937
938
72.4k
                uint32_t level = 0;
939
72.4k
                uint32_t sigCoefBits = 0;
940
72.4k
                costCoeff[scanPos] = MAX_INT64;
941
942
72.4k
                if ((int)scanPos == lastScanPos)
943
15.3k
                    sigRateDelta[blkPos] = 0;
944
57.1k
                else
945
57.1k
                {
946
57.1k
                    if (maxAbsLevel < 3)
947
14.0k
                    {
948
                        /* set default costs to uncoded costs */
949
14.0k
                        costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950
14.0k
                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951
14.0k
                    }
952
57.1k
                    sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953
57.1k
                    sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954
57.1k
                }
955
956
72.4k
                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957
                // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958
72.4k
                if (maxAbsLevel == 1)
959
12.5k
                {
960
12.5k
                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961
12.5k
                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962
963
12.5k
                    int unquantAbsLevel = unQuantLevel >> unquantShift;
964
12.5k
                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965
12.5k
                    int d = abs(signCoef) - unquantAbsLevel;
966
12.5k
                    int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967
968
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969
12.5k
                    if (usePsyMask & scanPos)
970
11.0k
                    {
971
11.0k
                        int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972
11.0k
                        curCost -= PSYVALUE(reconCoef);
973
11.0k
                    }
974
975
12.5k
                    if (curCost < costCoeff[scanPos])
976
11.8k
                    {
977
11.8k
                        level = 1;
978
11.8k
                        costCoeff[scanPos] = curCost;
979
11.8k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
980
11.8k
                    }
981
12.5k
                }
982
59.8k
                else if (maxAbsLevel)
983
59.8k
                {
984
59.8k
                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985
59.8k
                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986
987
59.8k
                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988
989
59.8k
                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990
59.8k
                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991
59.8k
                    int d0 = abs(signCoef) - unquantAbsLevel0;
992
59.8k
                    int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993
994
59.8k
                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995
59.8k
                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996
59.8k
                    int d1 = abs(signCoef) - unquantAbsLevel1;
997
59.8k
                    int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998
999
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000
59.8k
                    if (usePsyMask & scanPos)
1001
46.1k
                    {
1002
46.1k
                        int reconCoef;
1003
46.1k
                        reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004
46.1k
                        curCost0 -= PSYVALUE(reconCoef);
1005
1006
46.1k
                        reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007
46.1k
                        curCost1 -= PSYVALUE(reconCoef);
1008
46.1k
                    }
1009
59.8k
                    if (curCost0 < costCoeff[scanPos])
1010
59.8k
                    {
1011
59.8k
                        level = maxAbsLevel;
1012
59.8k
                        costCoeff[scanPos] = curCost0;
1013
59.8k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1014
59.8k
                    }
1015
59.8k
                    if (curCost1 < costCoeff[scanPos])
1016
1.45k
                    {
1017
1.45k
                        level = maxAbsLevel - 1;
1018
1.45k
                        costCoeff[scanPos] = curCost1;
1019
1.45k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1020
1.45k
                    }
1021
59.8k
                }
1022
1023
72.4k
                dstCoeff[blkPos] = (int16_t)level;
1024
72.4k
                totalRdCost += costCoeff[scanPos];
1025
1026
                /* record costs for sign-hiding performed at the end */
1027
72.4k
                if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028
71.7k
                {
1029
71.7k
                    const int32_t diff0 = level - 1 - baseLevel;
1030
71.7k
                    const int32_t diff2 = level + 1 - baseLevel;
1031
71.7k
                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032
71.7k
                    int rate0, rate1, rate2;
1033
1034
71.7k
                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035
11.1k
                    {
1036
                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037
                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038
11.1k
                        X265_CHECK(level == 1, "absLevel check failure\n");
1039
1040
11.1k
                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041
11.1k
                        const int rateNotEqual2 = greaterOneBits[0];
1042
1043
11.1k
                        rate0 = 0;
1044
11.1k
                        rate2 = rateEqual2;
1045
11.1k
                        rate1 = rateNotEqual2;
1046
1047
11.1k
                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048
11.1k
                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049
11.1k
                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050
11.1k
                    }
1051
60.5k
                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052
31.6k
                    {
1053
                        // NOTE: no c1c2 correct rate since all of rate include this factor
1054
31.6k
                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055
31.6k
                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056
31.6k
                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057
31.6k
                    }
1058
28.9k
                    else
1059
28.9k
                    {
1060
28.9k
                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061
28.9k
                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062
28.9k
                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063
28.9k
                    }
1064
71.7k
                    rateIncUp[blkPos] = rate2 - rate1;
1065
71.7k
                    rateIncDown[blkPos] = rate0 - rate1;
1066
71.7k
                }
1067
716
                else
1068
716
                {
1069
716
                    rateIncUp[blkPos] = greaterOneBits[0];
1070
716
                    rateIncDown[blkPos] = 0;
1071
716
                }
1072
1073
                /* Update CABAC estimation state */
1074
72.4k
                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075
24.4k
                {
1076
24.4k
                    goRiceParam++;
1077
24.4k
                    levelThreshold <<= 1;
1078
24.4k
                }
1079
1080
72.4k
                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081
72.4k
                c1Idx += isNonZero;
1082
1083
                /* update bin model */
1084
72.4k
                if (level > 1)
1085
58.9k
                {
1086
58.9k
                    c1 = 0;
1087
58.9k
                    c2 += (uint32_t)(c2 - 2) >> 31;
1088
58.9k
                    c2Idx++;
1089
58.9k
                }
1090
13.4k
                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091
7.85k
                    c1++;
1092
1093
72.4k
                if (dstCoeff[blkPos])
1094
71.7k
                {
1095
71.7k
                    sigCoeffGroupFlag64 |= cgBlkPosMask;
1096
71.7k
                    cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097
71.7k
                    cgRdStats.uncodedDist += costUncoded[blkPos];
1098
71.7k
                    cgRdStats.nnzBeforePos0 += scanPosinCG;
1099
71.7k
                }
1100
72.4k
            }
1101
1102
245k
            cgRdStats.sigCost += costSig[scanPos];
1103
245k
        } /* end for (scanPosinCG) */
1104
1105
15.3k
        X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106
15.3k
        cgRdStats.sigCost0 = costSig[scanPos];
1107
1108
15.3k
        costCoeffGroupSig[cgScanPos] = 0;
1109
1110
        /* nothing to do at this case */
1111
15.3k
        X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112
1113
15.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1114
15.3k
        {
1115
            /* coeff group 0 is implied to be present, no signal cost */
1116
            /* coeff group with last NZ is implied to be present, handled below */
1117
15.3k
        }
1118
0
        else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119
0
        {
1120
0
            if (!cgRdStats.nnzBeforePos0)
1121
0
            {
1122
                /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123
0
                totalRdCost -= cgRdStats.sigCost0;
1124
0
                cgRdStats.sigCost -= cgRdStats.sigCost0;
1125
0
            }
1126
1127
            /* there are coded coefficients in this group, but now we include the signaling cost
1128
             * of the significant coefficient group flag and evaluate whether the RD cost of the
1129
             * coded group is more than the RD cost of the uncoded group */
1130
1131
0
            uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132
1133
0
            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134
0
            costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135
0
            costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136
0
            costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137
1138
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140
1141
0
            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142
0
            {
1143
0
                sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144
0
                totalRdCost = costZeroCG;
1145
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146
1147
                /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148
0
                const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149
0
                memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150
0
                memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151
0
                memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152
0
                memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153
0
            }
1154
0
        }
1155
0
        else
1156
0
        {
1157
            /* there were no coded coefficients in this coefficient group */
1158
0
            uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161
0
            totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162
0
        }
1163
15.3k
    } /* end for (cgScanPos) */
1164
1165
14.5k
    X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166
1167
    /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168
14.5k
    int64_t bestCost;
1169
14.5k
    if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170
0
    {
1171
0
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172
0
        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173
0
    }
1174
14.5k
    else
1175
14.5k
    {
1176
14.5k
        int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177
14.5k
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178
14.5k
        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179
14.5k
    }
1180
1181
    /* This loop starts with the last non-zero found in the first loop and then refines this last
1182
     * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183
     * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184
     * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185
     * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186
14.5k
    int  bestLastIdx = 0;
1187
14.5k
    bool foundLast = false;
1188
29.9k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189
15.3k
    {
1190
15.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1191
15.3k
        {
1192
            /* the presence of these coefficient groups is inferred; they have no bit in
1193
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194
15.3k
        }
1195
0
        else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196
0
        {
1197
            /* remove cost of significant coeff group flag, the group's presence would be inferred
1198
             * from lastNZ if it were present in this group */
1199
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1200
0
        }
1201
0
        else
1202
0
        {
1203
            /* remove cost of signaling this empty group as not present */
1204
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1205
0
            continue;
1206
0
        }
1207
1208
199k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209
197k
        {
1210
197k
            scanPos = cgScanPos * cgSize + scanPosinCG;
1211
197k
            if ((int)scanPos > lastScanPos)
1212
171k
                continue;
1213
1214
            /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215
             * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216
             * cost of signaling it as not-significant */
1217
26.4k
            uint32_t blkPos = codeParams.scan[scanPos];
1218
26.4k
            if (dstCoeff[blkPos])
1219
24.8k
            {
1220
                // Calculates the cost of signaling the last significant coefficient in the block 
1221
24.8k
                uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222
24.8k
                if (codeParams.scanType == SCAN_VER)
1223
3.57k
                    std::swap(pos[0], pos[1]);
1224
24.8k
                uint32_t bitsLastNZ = 0;
1225
1226
74.4k
                for (int i = 0; i < 2; i++)
1227
49.6k
                {
1228
49.6k
                    int temp = g_lastCoeffTable[pos[i]];
1229
49.6k
                    int prefixOnes = temp & 15;
1230
49.6k
                    int suffixLen = temp >> 4;
1231
1232
49.6k
                    bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233
49.6k
                    bitsLastNZ += IEP_RATE * suffixLen;
1234
49.6k
                }
1235
1236
24.8k
                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237
1238
24.8k
                if (costAsLast < bestCost)
1239
16.0k
                {
1240
16.0k
                    bestLastIdx = scanPos + 1;
1241
16.0k
                    bestCost = costAsLast;
1242
16.0k
                }
1243
24.8k
                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244
13.6k
                {
1245
13.6k
                    foundLast = true;
1246
13.6k
                    break;
1247
13.6k
                }
1248
1249
11.1k
                totalRdCost -= costCoeff[scanPos];
1250
11.1k
                totalRdCost += costUncoded[blkPos];
1251
11.1k
            }
1252
1.66k
            else
1253
1.66k
                totalRdCost -= costSig[scanPos];
1254
26.4k
        }
1255
15.3k
    }
1256
1257
    /* recount non-zero coefficients and re-apply sign of DCT coef */
1258
14.5k
    numSig = 0;
1259
85.9k
    for (int pos = 0; pos < bestLastIdx; pos++)
1260
71.3k
    {
1261
71.3k
        int blkPos = codeParams.scan[pos];
1262
71.3k
        int level  = dstCoeff[blkPos];
1263
71.3k
        numSig += (level != 0);
1264
1265
71.3k
        uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266
71.3k
        dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267
71.3k
    }
1268
1269
    // Average 49.62 pixels
1270
    /* clean uncoded coefficients */
1271
14.5k
    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272
188k
    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273
174k
    {
1274
174k
        dstCoeff[codeParams.scan[pos]] = 0;
1275
174k
    }
1276
14.5k
    for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277
0
    {
1278
0
        const uint32_t blkPos = codeParams.scan[pos];
1279
0
        memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280
0
        memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281
0
        memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282
0
        memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283
0
    }
1284
1285
    /* rate-distortion based sign-hiding */
1286
15.3k
    if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287
4.60k
    {
1288
4.60k
        const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289
4.60k
        int lastCG = 1;
1290
1291
9.21k
        for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292
4.60k
        {
1293
4.60k
            int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294
4.60k
            int n;
1295
1296
4.60k
            if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297
0
                continue;
1298
1299
            /* measure distance between first and last non-zero coef in this
1300
             * coding group */
1301
4.60k
            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302
4.60k
            const int firstNZPosInCG = (uint8_t)posFirstLast;
1303
4.60k
            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304
4.60k
            const uint32_t absSumSign = posFirstLast;
1305
1306
4.60k
            if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307
4.45k
            {
1308
4.45k
                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309
1310
#if CHECKED_BUILD || _DEBUG
1311
                int32_t absSum_dummy = 0;
1312
                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313
                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314
                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315
#endif
1316
1317
                //if (signbit != absSumSign)
1318
4.45k
                if (((int32_t)(signbit ^ absSumSign)) < 0)
1319
2.08k
                {
1320
                    /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321
                     * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322
                     * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323
1324
2.08k
                    int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325
2.08k
                    uint32_t minPos = 0;
1326
2.08k
                    int8_t finalChange = 0;
1327
2.08k
                    int curChange = 0;
1328
2.08k
                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329
1330
29.1k
                    for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331
27.1k
                    {
1332
27.1k
                        const uint32_t blkPos = codeParams.scan[n + subPos];
1333
27.1k
                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334
27.1k
                        const int absLevel = abs(dstCoeff[blkPos]);
1335
                        // TODO: this is constant in non-scaling mode
1336
27.1k
                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337
27.1k
                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338
1339
27.1k
                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340
27.1k
                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341
1342
27.1k
                        const int64_t origDist = (((int64_t)d * d));
1343
1344
27.1k
#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345
1346
27.1k
                        const uint32_t isOne = (absLevel == 1);
1347
27.1k
                        if (dstCoeff[blkPos])
1348
26.0k
                        {
1349
26.0k
                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350
26.0k
                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351
26.0k
                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352
1353
                            /* if decrementing would make the coeff 0, we can include the
1354
                             * significant coeff flag cost savings */
1355
26.0k
                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356
26.0k
                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357
26.0k
                            int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358
26.0k
                            int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359
1360
26.0k
                            costDown -= lastCoeffAdjust;
1361
26.0k
                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362
1363
26.0k
                            curChange = 2 * (costUp < costDown) - 1;
1364
26.0k
                            curCost = (costUp < costDown) ? costUp : curCost;
1365
26.0k
                        }
1366
                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367
1.01k
                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368
0
                        {
1369
                            /* don't try to make a new coded coeff before the first coeff if its
1370
                             * sign would be different than the first coeff, the inferred sign would
1371
                             * still be wrong and we'd have to do this again. */
1372
0
                            curCost = MAX_INT64;
1373
0
                        }
1374
1.01k
                        else
1375
1.01k
                        {
1376
                            /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377
1.01k
                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378
1.01k
                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379
1.01k
                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380
1.01k
                            curChange = 1;
1381
1.01k
                        }
1382
1383
27.1k
                        if (curCost < minCostInc)
1384
7.37k
                        {
1385
7.37k
                            minCostInc = curCost;
1386
7.37k
                            finalChange = (int8_t)curChange;
1387
7.37k
                            minPos = blkPos + (absLevel << 16);
1388
7.37k
                        }
1389
27.1k
                        lastCoeffAdjust = 0;
1390
27.1k
                    }
1391
1392
2.08k
                    const int absInMinPos = (minPos >> 16);
1393
2.08k
                    minPos = (uint16_t)minPos;
1394
1395
                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396
2.08k
                    if (absInMinPos >= 32767)
1397
                        /* don't allow sign hiding to violate the SPEC range */
1398
0
                        finalChange = -1;
1399
1400
                    // NOTE: Reference code
1401
                    //if (dstCoeff[minPos] == 0)
1402
                    //    numSig++;
1403
                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404
                    //    numSig--;
1405
2.08k
                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406
1407
1408
                    // NOTE: Reference code
1409
                    //if (m_resiDctCoeff[minPos] >= 0)
1410
                    //    dstCoeff[minPos] += finalChange;
1411
                    //else
1412
                    //    dstCoeff[minPos] -= finalChange;
1413
2.08k
                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414
2.08k
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415
2.08k
                }
1416
4.45k
            }
1417
1418
4.60k
            lastCG = 0;
1419
4.60k
        }
1420
4.60k
    }
1421
1422
14.5k
    return numSig;
1423
6.32M
}
unsigned int x265::Quant::rdoQuant<3u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool)
Line
Count
Source
611
1.06M
{
612
1.06M
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613
1.06M
    int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614
1.06M
    const uint32_t usePsyMask = usePsy ? -1 : 0;
615
616
1.06M
    X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617
618
1.06M
    int rem = m_qpParam[ttype].rem;
619
1.06M
    int per = m_qpParam[ttype].per;
620
1.06M
    int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621
1.06M
    int add = (1 << (qbits - 1));
622
1.06M
    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623
624
1.06M
    const int numCoeff = 1 << (log2TrSize * 2);
625
1.06M
    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626
1.06M
    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627
1.06M
    if (!numSig)
628
1.04M
        return 0;
629
23.2k
    const uint32_t trSize = 1 << log2TrSize;
630
23.2k
    int64_t lambda2 = m_qpParam[ttype].lambda2;
631
23.2k
    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632
    /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633
     * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634
     * at several stages. We skip the clipping for simplicity when measuring RD cost */
635
23.2k
    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636
23.2k
    int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637
23.2k
    int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638
23.2k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
639
640
23.2k
#define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641
23.2k
#define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642
23.2k
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643
23.2k
#define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644
645
23.2k
    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646
23.2k
    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647
23.2k
    int64_t costSig[trSize * trSize];     /* lambda * bits       */
648
649
23.2k
    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650
23.2k
    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651
23.2k
    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652
653
23.2k
    int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654
23.2k
    uint64_t sigCoeffGroupFlag64 = 0;
655
656
23.2k
    const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657
23.2k
    bool bIsLuma = ttype == TEXT_LUMA;
658
659
    /* total rate distortion cost of transform block, as CBF=0 */
660
23.2k
    int64_t totalUncodedCost = 0;
661
662
    /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
663
     * the distortion and signal cost of coded blocks, and the coding cost of significant
664
     * coefficient and coefficient group bitmaps */
665
23.2k
    int64_t totalRdCost = 0;
666
667
23.2k
    TUEntropyCodingParameters codeParams;
668
23.2k
    cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669
23.2k
    const uint32_t log2TrSizeCG = log2TrSize - 2;
670
23.2k
    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671
23.2k
    const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672
673
23.2k
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674
23.2k
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675
23.2k
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676
677
#if CHECKED_BUILD || _DEBUG
678
    // clean output buffers; the asm version of scanPosLast never outputs anything after the last non-zero coeff group
679
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
680
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
681
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
682
#endif
683
23.2k
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684
23.2k
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685
686
687
    /* TODO: update bit estimates if dirty */
688
23.2k
    EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689
690
23.2k
    uint32_t scanPos = 0;
691
23.2k
    uint32_t c1 = 1;
692
693
    // process trailing all-zero coeff groups
694
695
    /* coefficients after lastNZ have no distortion signal cost */
696
23.2k
    const int zeroCG = cgNum - 1 - cgLastScanPos;
697
23.2k
    memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698
23.2k
    memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699
700
    /* sum zero coeff (uncoded) cost */
701
702
    // TODO: do we need these costs?
703
23.2k
    if (usePsyMask)
704
4.82k
    {
705
19.2k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706
14.4k
        {
707
14.4k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708
14.4k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709
14.4k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
710
14.4k
#if X265_ARCH_X86
711
14.4k
            bool enable512 = detect512();
712
14.4k
            if (enable512)
713
0
                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714
14.4k
            else
715
14.4k
            {
716
14.4k
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717
14.4k
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718
14.4k
            }
719
#else
720
            primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721
            primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722
#endif
723
14.4k
        }
724
4.82k
    }
725
18.4k
    else
726
18.4k
    {
727
        // non-psy path
728
74.0k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729
55.5k
        {
730
55.5k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731
55.5k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732
55.5k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
733
55.5k
            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734
55.5k
        }
735
18.4k
    }
736
23.2k
    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737
23.2k
    {
738
        // patternSigCtx = 0
739
23.2k
        {
740
23.2k
            2, 1, 1, 0,
741
23.2k
            1, 1, 0, 0,
742
23.2k
            1, 0, 0, 0,
743
23.2k
            0, 0, 0, 0,
744
23.2k
        },
745
        // patternSigCtx = 1
746
23.2k
        {
747
23.2k
            2, 2, 2, 2,
748
23.2k
            1, 1, 1, 1,
749
23.2k
            0, 0, 0, 0,
750
23.2k
            0, 0, 0, 0,
751
23.2k
        },
752
        // patternSigCtx = 2
753
23.2k
        {
754
23.2k
            2, 1, 0, 0,
755
23.2k
            2, 1, 0, 0,
756
23.2k
            2, 1, 0, 0,
757
23.2k
            2, 1, 0, 0,
758
23.2k
        },
759
        // patternSigCtx = 3
760
23.2k
        {
761
23.2k
            2, 2, 2, 2,
762
23.2k
            2, 2, 2, 2,
763
23.2k
            2, 2, 2, 2,
764
23.2k
            2, 2, 2, 2,
765
23.2k
        },
766
        // 4x4
767
23.2k
        {
768
23.2k
            0, 1, 4, 5,
769
23.2k
            2, 3, 4, 5,
770
23.2k
            6, 6, 8, 8,
771
23.2k
            7, 7, 8, 8
772
23.2k
        }
773
23.2k
    };
774
775
    /* iterate over coding groups in reverse scan order */
776
46.5k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777
23.3k
    {
778
23.3k
        uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779
23.3k
        const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780
23.3k
        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781
23.3k
        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782
23.3k
        const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783
23.3k
        const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784
23.3k
        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785
786
23.3k
        if (c1 == 0)
787
0
            ctxSet++;
788
23.3k
        c1 = 1;
789
790
23.3k
        if (cgScanPos && (coeffNum[cgScanPos] == 0))
791
0
        {
792
            // TODO: do we need the zero-coeff cost?
793
0
            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794
0
            uint32_t blkPos = codeParams.scan[scanPosBase];
795
0
            if (usePsyMask)
796
0
            {
797
0
#if X265_ARCH_X86
798
0
                bool enable512 = detect512();
799
0
                if (enable512)
800
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801
0
                else
802
0
                {
803
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805
0
                }
806
#else
807
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809
#endif
810
0
                blkPos = codeParams.scan[scanPosBase];
811
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
812
0
                {
813
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
814
0
                    {
815
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
818
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819
820
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823
0
                    }
824
0
                    blkPos += trSize;
825
0
                }
826
0
            }
827
0
            else
828
0
            {
829
                // non-psy path
830
0
                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831
0
                blkPos = codeParams.scan[scanPosBase];
832
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
833
0
                {
834
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
835
0
                    {
836
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
839
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840
841
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844
0
                    }
845
0
                    blkPos += trSize;
846
0
                }
847
0
            }
848
849
            /* there were no coded coefficients in this coefficient group */
850
0
            {
851
0
                uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853
0
                totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854
0
            }
855
0
            continue;
856
0
        }
857
858
23.3k
        coeffGroupRDStats cgRdStats;
859
23.3k
        memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860
861
23.3k
        uint32_t subFlagMask = coeffFlag[cgScanPos];
862
23.3k
        int    c2            = 0;
863
23.3k
        uint32_t goRiceParam = 0;
864
23.3k
        uint32_t levelThreshold = 3;
865
23.3k
        uint32_t c1Idx       = 0;
866
23.3k
        uint32_t c2Idx       = 0;
867
        /* iterate over coefficients in each group in reverse scan order */
868
396k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869
373k
        {
870
373k
            scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871
373k
            uint32_t blkPos      = codeParams.scan[scanPos];
872
373k
            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873
373k
            int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874
373k
            int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875
876
            /* RDOQ measures distortion as the squared difference between the unquantized coded level
877
             * and the original DCT coefficient. The result is shifted scaleBits to account for the
878
             * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879
880
            /* cost of not coding this coefficient (all distortion, no signal bits) */
881
373k
            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882
373k
            X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883
373k
            if (usePsyMask & scanPos)
884
                /* when no residual coefficient is coded, predicted coef == recon coef */
885
72.3k
                costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886
887
373k
            totalUncodedCost += costUncoded[blkPos];
888
889
            // coefficient level estimation
890
373k
            const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891
            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892
373k
            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893
373k
            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894
373k
            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
895
            // NOTE: above equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896
373k
            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897
898
            // position is after the last non-zero coeff (not yet reached in this reverse scan)
899
373k
            if (scanPos > (uint32_t)lastScanPos)
900
350k
            {
901
                /* coefficients after lastNZ have no distortion signal cost */
902
350k
                costCoeff[scanPos] = 0;
903
350k
                costSig[scanPos] = 0;
904
905
                /* No non-zero coefficient yet found, but this does not mean
906
                 * there is no uncoded-cost for this coefficient. Pre-
907
                 * quantization the coefficient may have been non-zero */
908
350k
                totalRdCost += costUncoded[blkPos];
909
350k
            }
910
23.2k
            else if (!(subFlagMask & 1))
911
0
            {
912
                // fast zero coeff path
913
                /* set default costs to uncoded costs */
914
0
                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915
0
                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916
0
                sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917
0
                totalRdCost += costCoeff[scanPos];
918
0
                rateIncUp[blkPos] = greaterOneBits[0];
919
920
0
                subFlagMask >>= 1;
921
0
            }
922
23.2k
            else
923
23.2k
            {
924
23.2k
                subFlagMask >>= 1;
925
926
23.2k
                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927
23.2k
                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928
929
23.2k
                X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930
23.2k
                X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931
23.2k
                X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932
23.2k
                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933
934
                // coefficient level estimation
935
23.2k
                const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936
18.4E
                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937
938
23.2k
                uint32_t level = 0;
939
23.2k
                uint32_t sigCoefBits = 0;
940
23.2k
                costCoeff[scanPos] = MAX_INT64;
941
942
23.2k
                if ((int)scanPos == lastScanPos)
943
23.3k
                    sigRateDelta[blkPos] = 0;
944
18.4E
                else
945
18.4E
                {
946
18.4E
                    if (maxAbsLevel < 3)
947
0
                    {
948
                        /* set default costs to uncoded costs */
949
0
                        costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950
0
                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951
0
                    }
952
18.4E
                    sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953
18.4E
                    sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954
18.4E
                }
955
956
23.2k
                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957
                // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958
23.2k
                if (maxAbsLevel == 1)
959
14.7k
                {
960
14.7k
                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961
14.7k
                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962
963
14.7k
                    int unquantAbsLevel = unQuantLevel >> unquantShift;
964
14.7k
                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965
14.7k
                    int d = abs(signCoef) - unquantAbsLevel;
966
14.7k
                    int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967
968
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969
14.7k
                    if (usePsyMask & scanPos)
970
0
                    {
971
0
                        int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972
0
                        curCost -= PSYVALUE(reconCoef);
973
0
                    }
974
975
14.7k
                    if (curCost < costCoeff[scanPos])
976
14.7k
                    {
977
14.7k
                        level = 1;
978
14.7k
                        costCoeff[scanPos] = curCost;
979
14.7k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
980
14.7k
                    }
981
14.7k
                }
982
8.51k
                else if (maxAbsLevel)
983
8.64k
                {
984
8.64k
                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985
8.64k
                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986
987
8.64k
                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988
989
8.64k
                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990
8.64k
                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991
8.64k
                    int d0 = abs(signCoef) - unquantAbsLevel0;
992
8.64k
                    int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993
994
8.64k
                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995
8.64k
                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996
8.64k
                    int d1 = abs(signCoef) - unquantAbsLevel1;
997
8.64k
                    int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998
999
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000
8.64k
                    if (usePsyMask & scanPos)
1001
0
                    {
1002
0
                        int reconCoef;
1003
0
                        reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004
0
                        curCost0 -= PSYVALUE(reconCoef);
1005
1006
0
                        reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007
0
                        curCost1 -= PSYVALUE(reconCoef);
1008
0
                    }
1009
8.64k
                    if (curCost0 < costCoeff[scanPos])
1010
8.64k
                    {
1011
8.64k
                        level = maxAbsLevel;
1012
8.64k
                        costCoeff[scanPos] = curCost0;
1013
8.64k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1014
8.64k
                    }
1015
8.64k
                    if (curCost1 < costCoeff[scanPos])
1016
0
                    {
1017
0
                        level = maxAbsLevel - 1;
1018
0
                        costCoeff[scanPos] = curCost1;
1019
0
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1020
0
                    }
1021
8.64k
                }
1022
1023
23.2k
                dstCoeff[blkPos] = (int16_t)level;
1024
23.2k
                totalRdCost += costCoeff[scanPos];
1025
1026
                /* record costs for sign-hiding performed at the end */
1027
18.4E
                if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028
23.3k
                {
1029
23.3k
                    const int32_t diff0 = level - 1 - baseLevel;
1030
23.3k
                    const int32_t diff2 = level + 1 - baseLevel;
1031
23.3k
                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032
23.3k
                    int rate0, rate1, rate2;
1033
1034
23.3k
                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035
14.7k
                    {
1036
                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037
                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038
14.7k
                        X265_CHECK(level == 1, "absLevel check failure\n");
1039
1040
14.7k
                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041
14.7k
                        const int rateNotEqual2 = greaterOneBits[0];
1042
1043
14.7k
                        rate0 = 0;
1044
14.7k
                        rate2 = rateEqual2;
1045
14.7k
                        rate1 = rateNotEqual2;
1046
1047
14.7k
                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048
14.7k
                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049
14.7k
                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050
14.7k
                    }
1051
8.64k
                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052
0
                    {
1053
                        // NOTE: no c1c2 rate correction here, since all three rates include this factor
1054
0
                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055
0
                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056
0
                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057
0
                    }
1058
8.64k
                    else
1059
8.64k
                    {
1060
8.64k
                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061
8.64k
                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062
8.64k
                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063
8.64k
                    }
1064
23.3k
                    rateIncUp[blkPos] = rate2 - rate1;
1065
23.3k
                    rateIncDown[blkPos] = rate0 - rate1;
1066
23.3k
                }
1067
18.4E
                else
1068
18.4E
                {
1069
18.4E
                    rateIncUp[blkPos] = greaterOneBits[0];
1070
18.4E
                    rateIncDown[blkPos] = 0;
1071
18.4E
                }
1072
1073
                /* Update CABAC estimation state */
1074
23.2k
                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075
8.64k
                {
1076
8.64k
                    goRiceParam++;
1077
8.64k
                    levelThreshold <<= 1;
1078
8.64k
                }
1079
1080
23.2k
                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081
23.2k
                c1Idx += isNonZero;
1082
1083
                /* update bin model */
1084
23.2k
                if (level > 1)
1085
8.64k
                {
1086
8.64k
                    c1 = 0;
1087
8.64k
                    c2 += (uint32_t)(c2 - 2) >> 31;
1088
8.64k
                    c2Idx++;
1089
8.64k
                }
1090
14.5k
                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091
14.7k
                    c1++;
1092
1093
23.2k
                if (dstCoeff[blkPos])
1094
23.3k
                {
1095
23.3k
                    sigCoeffGroupFlag64 |= cgBlkPosMask;
1096
23.3k
                    cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097
23.3k
                    cgRdStats.uncodedDist += costUncoded[blkPos];
1098
23.3k
                    cgRdStats.nnzBeforePos0 += scanPosinCG;
1099
23.3k
                }
1100
23.2k
            }
1101
1102
373k
            cgRdStats.sigCost += costSig[scanPos];
1103
373k
        } /* end for (scanPosinCG) */
1104
1105
23.3k
        X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106
23.3k
        cgRdStats.sigCost0 = costSig[scanPos];
1107
1108
23.3k
        costCoeffGroupSig[cgScanPos] = 0;
1109
1110
        /* nothing to do at this case */
1111
23.3k
        X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112
1113
23.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1114
23.3k
        {
1115
            /* coeff group 0 is implied to be present, no signal cost */
1116
            /* coeff group with last NZ is implied to be present, handled below */
1117
23.3k
        }
1118
0
        else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119
0
        {
1120
0
            if (!cgRdStats.nnzBeforePos0)
1121
0
            {
1122
                /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123
0
                totalRdCost -= cgRdStats.sigCost0;
1124
0
                cgRdStats.sigCost -= cgRdStats.sigCost0;
1125
0
            }
1126
1127
            /* there are coded coefficients in this group, but now we include the signaling cost
1128
             * of the significant coefficient group flag and evaluate whether the RD cost of the
1129
             * coded group is more than the RD cost of the uncoded group */
1130
1131
0
            uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132
1133
0
            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134
0
            costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135
0
            costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136
0
            costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137
1138
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140
1141
0
            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142
0
            {
1143
0
                sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144
0
                totalRdCost = costZeroCG;
1145
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146
1147
                /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148
0
                const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149
0
                memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150
0
                memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151
0
                memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152
0
                memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153
0
            }
1154
0
        }
1155
0
        else
1156
0
        {
1157
            /* there were no coded coefficients in this coefficient group */
1158
0
            uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161
0
            totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162
0
        }
1163
23.3k
    } /* end for (cgScanPos) */
1164
1165
23.2k
    X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166
1167
    /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168
23.2k
    int64_t bestCost;
1169
23.2k
    if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170
0
    {
1171
0
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172
0
        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173
0
    }
1174
23.2k
    else
1175
23.2k
    {
1176
23.2k
        int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177
23.2k
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178
23.2k
        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179
23.2k
    }
1180
1181
    /* This loop starts with the last non-zero found in the first loop and then refines this last
1182
     * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183
     * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184
     * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185
     * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186
23.2k
    int  bestLastIdx = 0;
1187
23.2k
    bool foundLast = false;
1188
46.5k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189
23.3k
    {
1190
23.3k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1191
23.3k
        {
1192
            /* the presence of these coefficient groups is inferred; they have no bit in
1193
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194
23.3k
        }
1195
0
        else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196
0
        {
1197
            /* remove cost of significant coeff group flag, the group's presence would be inferred
1198
             * from lastNZ if it were present in this group */
1199
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1200
0
        }
1201
0
        else
1202
0
        {
1203
            /* remove cost of signaling this empty group as not present */
1204
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1205
0
            continue;
1206
0
        }
1207
1208
388k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209
373k
        {
1210
373k
            scanPos = cgScanPos * cgSize + scanPosinCG;
1211
373k
            if ((int)scanPos > lastScanPos)
1212
350k
                continue;
1213
1214
            /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215
             * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216
             * cost of signaling it as not-significant */
1217
23.3k
            uint32_t blkPos = codeParams.scan[scanPos];
1218
23.3k
            if (dstCoeff[blkPos])
1219
23.3k
            {
1220
                // Calculates the cost of signaling the last significant coefficient in the block 
1221
23.3k
                uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222
23.3k
                if (codeParams.scanType == SCAN_VER)
1223
330
                    std::swap(pos[0], pos[1]);
1224
23.3k
                uint32_t bitsLastNZ = 0;
1225
1226
70.0k
                for (int i = 0; i < 2; i++)
1227
46.7k
                {
1228
46.7k
                    int temp = g_lastCoeffTable[pos[i]];
1229
46.7k
                    int prefixOnes = temp & 15;
1230
46.7k
                    int suffixLen = temp >> 4;
1231
1232
46.7k
                    bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233
46.7k
                    bitsLastNZ += IEP_RATE * suffixLen;
1234
46.7k
                }
1235
1236
23.3k
                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237
1238
23.3k
                if (costAsLast < bestCost)
1239
9.80k
                {
1240
9.80k
                    bestLastIdx = scanPos + 1;
1241
9.80k
                    bestCost = costAsLast;
1242
9.80k
                }
1243
23.3k
                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244
8.64k
                {
1245
8.64k
                    foundLast = true;
1246
8.64k
                    break;
1247
8.64k
                }
1248
1249
14.7k
                totalRdCost -= costCoeff[scanPos];
1250
14.7k
                totalRdCost += costUncoded[blkPos];
1251
14.7k
            }
1252
18.4E
            else
1253
18.4E
                totalRdCost -= costSig[scanPos];
1254
23.3k
        }
1255
23.3k
    }
1256
1257
    /* recount non-zero coefficients and re-apply sign of DCT coef */
1258
23.2k
    numSig = 0;
1259
33.0k
    for (int pos = 0; pos < bestLastIdx; pos++)
1260
9.80k
    {
1261
9.80k
        int blkPos = codeParams.scan[pos];
1262
9.80k
        int level  = dstCoeff[blkPos];
1263
9.80k
        numSig += (level != 0);
1264
1265
9.80k
        uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266
9.80k
        dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267
9.80k
    }
1268
1269
    // Average 49.62 pixels
1270
    /* clean uncoded coefficients */
1271
23.2k
    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272
386k
    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273
363k
    {
1274
363k
        dstCoeff[codeParams.scan[pos]] = 0;
1275
363k
    }
1276
23.2k
    for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277
0
    {
1278
0
        const uint32_t blkPos = codeParams.scan[pos];
1279
0
        memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280
0
        memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281
0
        memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282
0
        memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283
0
    }
1284
1285
    /* rate-distortion based sign-hiding */
1286
23.3k
    if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287
0
    {
1288
0
        const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289
0
        int lastCG = 1;
1290
1291
0
        for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292
0
        {
1293
0
            int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294
0
            int n;
1295
1296
0
            if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297
0
                continue;
1298
1299
            /* measure distance between first and last non-zero coef in this
1300
             * coding group */
1301
0
            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302
0
            const int firstNZPosInCG = (uint8_t)posFirstLast;
1303
0
            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304
0
            const uint32_t absSumSign = posFirstLast;
1305
1306
0
            if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307
0
            {
1308
0
                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309
1310
#if CHECKED_BUILD || _DEBUG
1311
                int32_t absSum_dummy = 0;
1312
                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313
                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314
                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315
#endif
1316
1317
                //if (signbit != absSumSign)
1318
0
                if (((int32_t)(signbit ^ absSumSign)) < 0)
1319
0
                {
1320
                    /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321
                     * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322
                     * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323
1324
0
                    int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325
0
                    uint32_t minPos = 0;
1326
0
                    int8_t finalChange = 0;
1327
0
                    int curChange = 0;
1328
0
                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329
1330
0
                    for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331
0
                    {
1332
0
                        const uint32_t blkPos = codeParams.scan[n + subPos];
1333
0
                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334
0
                        const int absLevel = abs(dstCoeff[blkPos]);
1335
                        // TODO: this is constant in non-scaling mode
1336
0
                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337
0
                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338
1339
0
                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340
0
                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341
1342
0
                        const int64_t origDist = (((int64_t)d * d));
1343
1344
0
#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345
1346
0
                        const uint32_t isOne = (absLevel == 1);
1347
0
                        if (dstCoeff[blkPos])
1348
0
                        {
1349
0
                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351
0
                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352
1353
                            /* if decrementing would make the coeff 0, we can include the
1354
                             * significant coeff flag cost savings */
1355
0
                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357
0
                            int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358
0
                            int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359
1360
0
                            costDown -= lastCoeffAdjust;
1361
0
                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362
1363
0
                            curChange = 2 * (costUp < costDown) - 1;
1364
0
                            curCost = (costUp < costDown) ? costUp : curCost;
1365
0
                        }
1366
                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367
0
                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368
0
                        {
1369
                            /* don't try to make a new coded coeff before the first coeff if its
1370
                             * sign would be different than the first coeff, the inferred sign would
1371
                             * still be wrong and we'd have to do this again. */
1372
0
                            curCost = MAX_INT64;
1373
0
                        }
1374
0
                        else
1375
0
                        {
1376
                            /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377
0
                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378
0
                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379
0
                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380
0
                            curChange = 1;
1381
0
                        }
1382
1383
0
                        if (curCost < minCostInc)
1384
0
                        {
1385
0
                            minCostInc = curCost;
1386
0
                            finalChange = (int8_t)curChange;
1387
0
                            minPos = blkPos + (absLevel << 16);
1388
0
                        }
1389
0
                        lastCoeffAdjust = 0;
1390
0
                    }
1391
1392
0
                    const int absInMinPos = (minPos >> 16);
1393
0
                    minPos = (uint16_t)minPos;
1394
1395
                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396
0
                    if (absInMinPos >= 32767)
1397
                        /* don't allow sign hiding to violate the SPEC range */
1398
0
                        finalChange = -1;
1399
1400
                    // NOTE: Reference code
1401
                    //if (dstCoeff[minPos] == 0)
1402
                    //    numSig++;
1403
                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404
                    //    numSig--;
1405
0
                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406
1407
1408
                    // NOTE: Reference code
1409
                    //if (m_resiDctCoeff[minPos] >= 0)
1410
                    //    dstCoeff[minPos] += finalChange;
1411
                    //else
1412
                    //    dstCoeff[minPos] -= finalChange;
1413
0
                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414
0
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415
0
                }
1416
0
            }
1417
1418
0
            lastCG = 0;
1419
0
        }
1420
0
    }
1421
1422
23.2k
    return numSig;
1423
1.06M
}
unsigned int x265::Quant::rdoQuant<4u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool)
Line
Count
Source
611
232k
{
612
232k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613
232k
    int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614
232k
    const uint32_t usePsyMask = usePsy ? -1 : 0;
615
616
232k
    X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617
618
232k
    int rem = m_qpParam[ttype].rem;
619
232k
    int per = m_qpParam[ttype].per;
620
232k
    int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621
232k
    int add = (1 << (qbits - 1));
622
232k
    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623
624
232k
    const int numCoeff = 1 << (log2TrSize * 2);
625
232k
    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626
232k
    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627
232k
    if (!numSig)
628
223k
        return 0;
629
9.28k
    const uint32_t trSize = 1 << log2TrSize;
630
9.28k
    int64_t lambda2 = m_qpParam[ttype].lambda2;
631
9.28k
    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632
    /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633
     * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634
     * at several stages. We skip the clipping for simplicity when measuring RD cost */
635
9.28k
    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636
9.28k
    int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637
9.28k
    int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638
9.28k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
639
640
9.28k
#define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641
9.28k
#define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642
9.28k
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643
9.28k
#define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644
645
9.28k
    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646
9.28k
    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647
9.28k
    int64_t costSig[trSize * trSize];     /* lambda * bits       */
648
649
9.28k
    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650
9.28k
    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651
9.28k
    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652
653
9.28k
    int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654
9.28k
    uint64_t sigCoeffGroupFlag64 = 0;
655
656
9.28k
    const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657
9.28k
    bool bIsLuma = ttype == TEXT_LUMA;
658
659
    /* total rate distortion cost of transform block, as CBF=0 */
660
9.28k
    int64_t totalUncodedCost = 0;
661
662
    /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
663
     * the distortion and signal cost of coded blocks, and the coding cost of significant
664
     * coefficient and coefficient group bitmaps */
665
9.28k
    int64_t totalRdCost = 0;
666
667
9.28k
    TUEntropyCodingParameters codeParams;
668
9.28k
    cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669
9.28k
    const uint32_t log2TrSizeCG = log2TrSize - 2;
670
9.28k
    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671
9.28k
    const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672
673
9.28k
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674
9.28k
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675
9.28k
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676
677
#if CHECKED_BUILD || _DEBUG
678
    // clear the output buffers; the asm version of scanPosLast never writes anything after the last non-zero coeff group
679
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
680
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
681
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
682
#endif
683
9.28k
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684
9.28k
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685
686
687
    /* TODO: update bit estimates if dirty */
688
9.28k
    EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689
690
9.28k
    uint32_t scanPos = 0;
691
9.28k
    uint32_t c1 = 1;
692
693
    // process the trailing all-zero coefficient groups
694
695
    /* coefficients after lastNZ have no distortion signal cost */
696
9.28k
    const int zeroCG = cgNum - 1 - cgLastScanPos;
697
9.28k
    memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698
9.28k
    memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699
700
    /* sum zero coeff (uncoded) cost */
701
702
    // TODO: do we need these costs?
703
9.28k
    if (usePsyMask)
704
5.73k
    {
705
91.7k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706
86.0k
        {
707
86.0k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708
86.0k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709
86.0k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
710
86.0k
#if X265_ARCH_X86
711
86.0k
            bool enable512 = detect512();
712
86.0k
            if (enable512)
713
0
                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714
86.0k
            else
715
86.0k
            {
716
86.0k
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717
86.0k
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718
86.0k
            }
719
#else
720
            primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721
            primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722
#endif
723
86.0k
        }
724
5.73k
    }
725
3.54k
    else
726
3.54k
    {
727
        // non-psy path
728
57.2k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729
53.6k
        {
730
53.6k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731
53.6k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732
53.6k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
733
53.6k
            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734
53.6k
        }
735
3.54k
    }
736
9.28k
    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737
9.28k
    {
738
        // patternSigCtx = 0
739
9.28k
        {
740
9.28k
            2, 1, 1, 0,
741
9.28k
            1, 1, 0, 0,
742
9.28k
            1, 0, 0, 0,
743
9.28k
            0, 0, 0, 0,
744
9.28k
        },
745
        // patternSigCtx = 1
746
9.28k
        {
747
9.28k
            2, 2, 2, 2,
748
9.28k
            1, 1, 1, 1,
749
9.28k
            0, 0, 0, 0,
750
9.28k
            0, 0, 0, 0,
751
9.28k
        },
752
        // patternSigCtx = 2
753
9.28k
        {
754
9.28k
            2, 1, 0, 0,
755
9.28k
            2, 1, 0, 0,
756
9.28k
            2, 1, 0, 0,
757
9.28k
            2, 1, 0, 0,
758
9.28k
        },
759
        // patternSigCtx = 3
760
9.28k
        {
761
9.28k
            2, 2, 2, 2,
762
9.28k
            2, 2, 2, 2,
763
9.28k
            2, 2, 2, 2,
764
9.28k
            2, 2, 2, 2,
765
9.28k
        },
766
        // 4x4
767
9.28k
        {
768
9.28k
            0, 1, 4, 5,
769
9.28k
            2, 3, 4, 5,
770
9.28k
            6, 6, 8, 8,
771
9.28k
            7, 7, 8, 8
772
9.28k
        }
773
9.28k
    };
774
775
    /* iterate over coding groups in reverse scan order */
776
18.5k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777
9.31k
    {
778
9.31k
        uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779
9.31k
        const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780
9.31k
        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781
9.31k
        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782
9.31k
        const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783
9.31k
        const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784
9.31k
        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785
786
9.31k
        if (c1 == 0)
787
0
            ctxSet++;
788
9.31k
        c1 = 1;
789
790
9.31k
        if (cgScanPos && (coeffNum[cgScanPos] == 0))
791
0
        {
792
            // TODO: do we need the zero-coeff cost?
793
0
            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794
0
            uint32_t blkPos = codeParams.scan[scanPosBase];
795
0
            if (usePsyMask)
796
0
            {
797
0
#if X265_ARCH_X86
798
0
                bool enable512 = detect512();
799
0
                if (enable512)
800
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801
0
                else
802
0
                {
803
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805
0
                }
806
#else
807
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809
#endif
810
0
                blkPos = codeParams.scan[scanPosBase];
811
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
812
0
                {
813
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
814
0
                    {
815
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
818
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819
820
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823
0
                    }
824
0
                    blkPos += trSize;
825
0
                }
826
0
            }
827
0
            else
828
0
            {
829
                // non-psy path
830
0
                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831
0
                blkPos = codeParams.scan[scanPosBase];
832
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
833
0
                {
834
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
835
0
                    {
836
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
839
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840
841
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844
0
                    }
845
0
                    blkPos += trSize;
846
0
                }
847
0
            }
848
849
            /* there were no coded coefficients in this coefficient group */
850
0
            {
851
0
                uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853
0
                totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854
0
            }
855
0
            continue;
856
0
        }
857
858
9.31k
        coeffGroupRDStats cgRdStats;
859
9.31k
        memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860
861
9.31k
        uint32_t subFlagMask = coeffFlag[cgScanPos];
862
9.31k
        int    c2            = 0;
863
9.31k
        uint32_t goRiceParam = 0;
864
9.31k
        uint32_t levelThreshold = 3;
865
9.31k
        uint32_t c1Idx       = 0;
866
9.31k
        uint32_t c2Idx       = 0;
867
        /* iterate over coefficients in each group in reverse scan order */
868
158k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869
149k
        {
870
149k
            scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871
149k
            uint32_t blkPos      = codeParams.scan[scanPos];
872
149k
            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873
149k
            int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874
149k
            int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875
876
            /* RDOQ measures distortion as the squared difference between the unquantized coded level
877
             * and the original DCT coefficient. The result is shifted scaleBits to account for the
878
             * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879
880
            /* cost of not coding this coefficient (all distortion, no signal bits) */
881
149k
            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882
149k
            X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883
149k
            if (usePsyMask & scanPos)
884
                /* when no residual coefficient is coded, predicted coef == recon coef */
885
86.0k
                costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886
887
149k
            totalUncodedCost += costUncoded[blkPos];
888
889
            // coefficient level estimation
890
149k
            const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891
            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892
149k
            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893
149k
            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894
149k
            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
895
            // NOTE: the above is equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896
149k
            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897
898
            // last non-zero coeff not yet reached (we iterate in reverse scan order)
899
149k
            if (scanPos > (uint32_t)lastScanPos)
900
139k
            {
901
                /* coefficients after lastNZ have no distortion signal cost */
902
139k
                costCoeff[scanPos] = 0;
903
139k
                costSig[scanPos] = 0;
904
905
                /* No non-zero coefficient yet found, but this does not mean
906
                 * there is no uncoded-cost for this coefficient. Pre-
907
                 * quantization the coefficient may have been non-zero */
908
139k
                totalRdCost += costUncoded[blkPos];
909
139k
            }
910
9.31k
            else if (!(subFlagMask & 1))
911
0
            {
912
                // fast zero coeff path
913
                /* set default costs to uncoded costs */
914
0
                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915
0
                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916
0
                sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917
0
                totalRdCost += costCoeff[scanPos];
918
0
                rateIncUp[blkPos] = greaterOneBits[0];
919
920
0
                subFlagMask >>= 1;
921
0
            }
922
9.31k
            else
923
9.31k
            {
924
9.31k
                subFlagMask >>= 1;
925
926
9.31k
                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927
9.31k
                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928
929
9.31k
                X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930
9.31k
                X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931
9.31k
                X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932
9.31k
                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933
934
                // coefficient level estimation
935
9.31k
                const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936
9.31k
                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937
938
9.31k
                uint32_t level = 0;
939
9.31k
                uint32_t sigCoefBits = 0;
940
9.31k
                costCoeff[scanPos] = MAX_INT64;
941
942
9.31k
                if ((int)scanPos == lastScanPos)
943
9.31k
                    sigRateDelta[blkPos] = 0;
944
0
                else
945
0
                {
946
0
                    if (maxAbsLevel < 3)
947
0
                    {
948
                        /* set default costs to uncoded costs */
949
0
                        costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950
0
                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951
0
                    }
952
0
                    sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953
0
                    sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954
0
                }
955
956
9.31k
                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957
                // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958
9.31k
                if (maxAbsLevel == 1)
959
2.16k
                {
960
2.16k
                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961
2.16k
                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962
963
2.16k
                    int unquantAbsLevel = unQuantLevel >> unquantShift;
964
2.16k
                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965
2.16k
                    int d = abs(signCoef) - unquantAbsLevel;
966
2.16k
                    int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967
968
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969
2.16k
                    if (usePsyMask & scanPos)
970
0
                    {
971
0
                        int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972
0
                        curCost -= PSYVALUE(reconCoef);
973
0
                    }
974
975
2.16k
                    if (curCost < costCoeff[scanPos])
976
2.16k
                    {
977
2.16k
                        level = 1;
978
2.16k
                        costCoeff[scanPos] = curCost;
979
2.16k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
980
2.16k
                    }
981
2.16k
                }
982
7.15k
                else if (maxAbsLevel)
983
7.15k
                {
984
7.15k
                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985
7.15k
                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986
987
7.15k
                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988
989
7.15k
                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990
7.15k
                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991
7.15k
                    int d0 = abs(signCoef) - unquantAbsLevel0;
992
7.15k
                    int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993
994
7.15k
                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995
7.15k
                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996
7.15k
                    int d1 = abs(signCoef) - unquantAbsLevel1;
997
7.15k
                    int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998
999
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000
7.15k
                    if (usePsyMask & scanPos)
1001
0
                    {
1002
0
                        int reconCoef;
1003
0
                        reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004
0
                        curCost0 -= PSYVALUE(reconCoef);
1005
1006
0
                        reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007
0
                        curCost1 -= PSYVALUE(reconCoef);
1008
0
                    }
1009
7.15k
                    if (curCost0 < costCoeff[scanPos])
1010
7.15k
                    {
1011
7.15k
                        level = maxAbsLevel;
1012
7.15k
                        costCoeff[scanPos] = curCost0;
1013
7.15k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1014
7.15k
                    }
1015
7.15k
                    if (curCost1 < costCoeff[scanPos])
1016
63
                    {
1017
63
                        level = maxAbsLevel - 1;
1018
63
                        costCoeff[scanPos] = curCost1;
1019
63
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1020
63
                    }
1021
7.15k
                }
1022
1023
9.31k
                dstCoeff[blkPos] = (int16_t)level;
1024
9.31k
                totalRdCost += costCoeff[scanPos];
1025
1026
                /* record costs for sign-hiding performed at the end */
1027
9.31k
                if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028
9.31k
                {
1029
9.31k
                    const int32_t diff0 = level - 1 - baseLevel;
1030
9.31k
                    const int32_t diff2 = level + 1 - baseLevel;
1031
9.31k
                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032
9.31k
                    int rate0, rate1, rate2;
1033
1034
9.31k
                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035
2.16k
                    {
1036
                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037
                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038
2.16k
                        X265_CHECK(level == 1, "absLevel check failure\n");
1039
1040
2.16k
                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041
2.16k
                        const int rateNotEqual2 = greaterOneBits[0];
1042
1043
2.16k
                        rate0 = 0;
1044
2.16k
                        rate2 = rateEqual2;
1045
2.16k
                        rate1 = rateNotEqual2;
1046
1047
2.16k
                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048
2.16k
                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049
2.16k
                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050
2.16k
                    }
1051
7.15k
                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052
0
                    {
1053
                        // NOTE: no c1c2 correct rate since all of rate include this factor
1054
0
                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055
0
                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056
0
                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057
0
                    }
1058
7.15k
                    else
1059
7.15k
                    {
1060
7.15k
                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061
7.15k
                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062
7.15k
                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063
7.15k
                    }
1064
9.31k
                    rateIncUp[blkPos] = rate2 - rate1;
1065
9.31k
                    rateIncDown[blkPos] = rate0 - rate1;
1066
9.31k
                }
1067
0
                else
1068
0
                {
1069
0
                    rateIncUp[blkPos] = greaterOneBits[0];
1070
0
                    rateIncDown[blkPos] = 0;
1071
0
                }
1072
1073
                /* Update CABAC estimation state */
1074
9.31k
                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075
7.15k
                {
1076
7.15k
                    goRiceParam++;
1077
7.15k
                    levelThreshold <<= 1;
1078
7.15k
                }
1079
1080
9.31k
                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081
9.31k
                c1Idx += isNonZero;
1082
1083
                /* update bin model */
1084
9.31k
                if (level > 1)
1085
7.15k
                {
1086
7.15k
                    c1 = 0;
1087
7.15k
                    c2 += (uint32_t)(c2 - 2) >> 31;
1088
7.15k
                    c2Idx++;
1089
7.15k
                }
1090
2.16k
                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091
2.16k
                    c1++;
1092
1093
9.31k
                if (dstCoeff[blkPos])
1094
9.31k
                {
1095
9.31k
                    sigCoeffGroupFlag64 |= cgBlkPosMask;
1096
9.31k
                    cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097
9.31k
                    cgRdStats.uncodedDist += costUncoded[blkPos];
1098
9.31k
                    cgRdStats.nnzBeforePos0 += scanPosinCG;
1099
9.31k
                }
1100
9.31k
            }
1101
1102
149k
            cgRdStats.sigCost += costSig[scanPos];
1103
149k
        } /* end for (scanPosinCG) */
1104
1105
9.31k
        X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106
9.31k
        cgRdStats.sigCost0 = costSig[scanPos];
1107
1108
9.31k
        costCoeffGroupSig[cgScanPos] = 0;
1109
1110
        /* nothing to do at this case */
1111
9.31k
        X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112
1113
9.31k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1114
9.31k
        {
1115
            /* coeff group 0 is implied to be present, no signal cost */
1116
            /* coeff group with last NZ is implied to be present, handled below */
1117
9.31k
        }
1118
0
        else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119
0
        {
1120
0
            if (!cgRdStats.nnzBeforePos0)
1121
0
            {
1122
                /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123
0
                totalRdCost -= cgRdStats.sigCost0;
1124
0
                cgRdStats.sigCost -= cgRdStats.sigCost0;
1125
0
            }
1126
1127
            /* there are coded coefficients in this group, but now we include the signaling cost
1128
             * of the significant coefficient group flag and evaluate whether the RD cost of the
1129
             * coded group is more than the RD cost of the uncoded group */
1130
1131
0
            uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132
1133
0
            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134
0
            costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135
0
            costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136
0
            costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137
1138
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140
1141
0
            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142
0
            {
1143
0
                sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144
0
                totalRdCost = costZeroCG;
1145
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146
1147
                /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148
0
                const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149
0
                memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150
0
                memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151
0
                memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152
0
                memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153
0
            }
1154
0
        }
1155
0
        else
1156
0
        {
1157
            /* there were no coded coefficients in this coefficient group */
1158
0
            uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161
0
            totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162
0
        }
1163
9.31k
    } /* end for (cgScanPos) */
1164
1165
9.28k
    X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166
1167
    /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168
9.28k
    int64_t bestCost;
1169
9.28k
    if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170
0
    {
1171
0
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172
0
        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173
0
    }
1174
9.28k
    else
1175
9.28k
    {
1176
9.28k
        int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177
9.28k
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178
9.28k
        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179
9.28k
    }
1180
1181
    /* This loop starts with the last non-zero found in the first loop and then refines this last
1182
     * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183
     * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184
     * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185
     * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186
9.28k
    int  bestLastIdx = 0;
1187
9.28k
    bool foundLast = false;
1188
18.5k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189
9.31k
    {
1190
9.31k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1191
9.31k
        {
1192
            /* the presence of these coefficient groups is inferred; they have no bit in
1193
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194
9.31k
        }
1195
0
        else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196
0
        {
1197
            /* remove cost of significant coeff group flag, the group's presence would be inferred
1198
             * from lastNZ if it were present in this group */
1199
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1200
0
        }
1201
0
        else
1202
0
        {
1203
            /* remove cost of signaling this empty group as not present */
1204
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1205
0
            continue;
1206
0
        }
1207
1208
151k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209
149k
        {
1210
149k
            scanPos = cgScanPos * cgSize + scanPosinCG;
1211
149k
            if ((int)scanPos > lastScanPos)
1212
139k
                continue;
1213
1214
            /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215
             * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216
             * cost of signaling it as not-significant */
1217
9.31k
            uint32_t blkPos = codeParams.scan[scanPos];
1218
9.31k
            if (dstCoeff[blkPos])
1219
9.31k
            {
1220
                // Calculates the cost of signaling the last significant coefficient in the block 
1221
9.31k
                uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222
9.31k
                if (codeParams.scanType == SCAN_VER)
1223
0
                    std::swap(pos[0], pos[1]);
1224
9.31k
                uint32_t bitsLastNZ = 0;
1225
1226
27.9k
                for (int i = 0; i < 2; i++)
1227
18.6k
                {
1228
18.6k
                    int temp = g_lastCoeffTable[pos[i]];
1229
18.6k
                    int prefixOnes = temp & 15;
1230
18.6k
                    int suffixLen = temp >> 4;
1231
1232
18.6k
                    bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233
18.6k
                    bitsLastNZ += IEP_RATE * suffixLen;
1234
18.6k
                }
1235
1236
9.31k
                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237
1238
9.31k
                if (costAsLast < bestCost)
1239
8.65k
                {
1240
8.65k
                    bestLastIdx = scanPos + 1;
1241
8.65k
                    bestCost = costAsLast;
1242
8.65k
                }
1243
9.31k
                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244
7.15k
                {
1245
7.15k
                    foundLast = true;
1246
7.15k
                    break;
1247
7.15k
                }
1248
1249
2.16k
                totalRdCost -= costCoeff[scanPos];
1250
2.16k
                totalRdCost += costUncoded[blkPos];
1251
2.16k
            }
1252
0
            else
1253
0
                totalRdCost -= costSig[scanPos];
1254
9.31k
        }
1255
9.31k
    }
1256
1257
    /* recount non-zero coefficients and re-apply sign of DCT coef */
1258
9.28k
    numSig = 0;
1259
17.9k
    for (int pos = 0; pos < bestLastIdx; pos++)
1260
8.65k
    {
1261
8.65k
        int blkPos = codeParams.scan[pos];
1262
8.65k
        int level  = dstCoeff[blkPos];
1263
8.65k
        numSig += (level != 0);
1264
1265
8.65k
        uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266
8.65k
        dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267
8.65k
    }
1268
1269
    // Average 49.62 pixels
1270
    /* clean uncoded coefficients */
1271
9.28k
    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272
149k
    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273
140k
    {
1274
140k
        dstCoeff[codeParams.scan[pos]] = 0;
1275
140k
    }
1276
9.28k
    for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277
0
    {
1278
0
        const uint32_t blkPos = codeParams.scan[pos];
1279
0
        memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280
0
        memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281
0
        memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282
0
        memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283
0
    }
1284
1285
    /* rate-distortion based sign-hiding */
1286
9.31k
    if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287
0
    {
1288
0
        const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289
0
        int lastCG = 1;
1290
1291
0
        for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292
0
        {
1293
0
            int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294
0
            int n;
1295
1296
0
            if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297
0
                continue;
1298
1299
            /* measure distance between first and last non-zero coef in this
1300
             * coding group */
1301
0
            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302
0
            const int firstNZPosInCG = (uint8_t)posFirstLast;
1303
0
            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304
0
            const uint32_t absSumSign = posFirstLast;
1305
1306
0
            if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307
0
            {
1308
0
                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309
1310
#if CHECKED_BUILD || _DEBUG
1311
                int32_t absSum_dummy = 0;
1312
                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313
                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314
                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315
#endif
1316
1317
                //if (signbit != absSumSign)
1318
0
                if (((int32_t)(signbit ^ absSumSign)) < 0)
1319
0
                {
1320
                    /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321
                     * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322
                     * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323
1324
0
                    int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325
0
                    uint32_t minPos = 0;
1326
0
                    int8_t finalChange = 0;
1327
0
                    int curChange = 0;
1328
0
                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329
1330
0
                    for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331
0
                    {
1332
0
                        const uint32_t blkPos = codeParams.scan[n + subPos];
1333
0
                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334
0
                        const int absLevel = abs(dstCoeff[blkPos]);
1335
                        // TODO: this is constant in non-scaling mode
1336
0
                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337
0
                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338
1339
0
                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340
0
                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341
1342
0
                        const int64_t origDist = (((int64_t)d * d));
1343
1344
0
#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345
1346
0
                        const uint32_t isOne = (absLevel == 1);
1347
0
                        if (dstCoeff[blkPos])
1348
0
                        {
1349
0
                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351
0
                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352
1353
                            /* if decrementing would make the coeff 0, we can include the
1354
                             * significant coeff flag cost savings */
1355
0
                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357
0
                            int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358
0
                            int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359
1360
0
                            costDown -= lastCoeffAdjust;
1361
0
                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362
1363
0
                            curChange = 2 * (costUp < costDown) - 1;
1364
0
                            curCost = (costUp < costDown) ? costUp : curCost;
1365
0
                        }
1366
                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367
0
                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368
0
                        {
1369
                            /* don't try to make a new coded coeff before the first coeff if its
1370
                             * sign would be different than the first coeff, the inferred sign would
1371
                             * still be wrong and we'd have to do this again. */
1372
0
                            curCost = MAX_INT64;
1373
0
                        }
1374
0
                        else
1375
0
                        {
1376
                            /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377
0
                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378
0
                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379
0
                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380
0
                            curChange = 1;
1381
0
                        }
1382
1383
0
                        if (curCost < minCostInc)
1384
0
                        {
1385
0
                            minCostInc = curCost;
1386
0
                            finalChange = (int8_t)curChange;
1387
0
                            minPos = blkPos + (absLevel << 16);
1388
0
                        }
1389
0
                        lastCoeffAdjust = 0;
1390
0
                    }
1391
1392
0
                    const int absInMinPos = (minPos >> 16);
1393
0
                    minPos = (uint16_t)minPos;
1394
1395
                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396
0
                    if (absInMinPos >= 32767)
1397
                        /* don't allow sign hiding to violate the SPEC range */
1398
0
                        finalChange = -1;
1399
1400
                    // NOTE: Reference code
1401
                    //if (dstCoeff[minPos] == 0)
1402
                    //    numSig++;
1403
                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404
                    //    numSig--;
1405
0
                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406
1407
1408
                    // NOTE: Reference code
1409
                    //if (m_resiDctCoeff[minPos] >= 0)
1410
                    //    dstCoeff[minPos] += finalChange;
1411
                    //else
1412
                    //    dstCoeff[minPos] -= finalChange;
1413
0
                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414
0
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415
0
                }
1416
0
            }
1417
1418
0
            lastCG = 0;
1419
0
        }
1420
0
    }
1421
1422
9.28k
    return numSig;
1423
232k
}
unsigned int x265::Quant::rdoQuant<5u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool)
Line
Count
Source
611
20.7k
{
612
20.7k
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
613
20.7k
    int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
614
20.7k
    const uint32_t usePsyMask = usePsy ? -1 : 0;
615
616
20.7k
    X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
617
618
20.7k
    int rem = m_qpParam[ttype].rem;
619
20.7k
    int per = m_qpParam[ttype].per;
620
20.7k
    int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
621
20.7k
    int add = (1 << (qbits - 1));
622
20.7k
    const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
623
624
20.7k
    const int numCoeff = 1 << (log2TrSize * 2);
625
20.7k
    uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
626
20.7k
    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
627
20.7k
    if (!numSig)
628
18.4k
        return 0;
629
2.31k
    const uint32_t trSize = 1 << log2TrSize;
630
2.31k
    int64_t lambda2 = m_qpParam[ttype].lambda2;
631
2.31k
    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
632
    /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
633
     * scale applied that must be removed during unquant. Note that in real dequant there is clipping
634
     * at several stages. We skip the clipping for simplicity when measuring RD cost */
635
2.31k
    const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
636
2.31k
    int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
637
2.31k
    int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
638
2.31k
    const int scaleBits = SCALE_BITS - 2 * transformShift;
639
640
2.31k
#define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
641
2.31k
#define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
642
2.31k
#define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
643
2.31k
#define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
644
645
2.31k
    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
646
2.31k
    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
647
2.31k
    int64_t costSig[trSize * trSize];     /* lambda * bits       */
648
649
2.31k
    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
650
2.31k
    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
651
2.31k
    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
652
653
2.31k
    int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
654
2.31k
    uint64_t sigCoeffGroupFlag64 = 0;
655
656
2.31k
    const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
657
2.31k
    bool bIsLuma = ttype == TEXT_LUMA;
658
659
    /* total rate distortion cost of transform block, as CBF=0 */
660
2.31k
    int64_t totalUncodedCost = 0;
661
662
    /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
663
     * the distortion and signal cost of coded blocks, and the coding cost of significant
664
     * coefficient and coefficient group bitmaps */
665
2.31k
    int64_t totalRdCost = 0;
666
667
2.31k
    TUEntropyCodingParameters codeParams;
668
2.31k
    cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
669
2.31k
    const uint32_t log2TrSizeCG = log2TrSize - 2;
670
2.31k
    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
671
2.31k
    const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
672
673
2.31k
    uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
674
2.31k
    uint16_t coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
675
2.31k
    uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
676
677
#if CHECKED_BUILD || _DEBUG
678
    // clean output buffer, the asm version of scanPosLast never outputs anything after the last non-zero coeff group
679
    memset(coeffNum, 0, sizeof(coeffNum) * sizeof(uint8_t));
680
    memset(coeffSign, 0, sizeof(coeffNum) * sizeof(uint16_t));
681
    memset(coeffFlag, 0, sizeof(coeffNum) * sizeof(uint16_t));
682
#endif
683
2.31k
    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
684
2.31k
    const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
685
686
687
    /* TODO: update bit estimates if dirty */
688
2.31k
    EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
689
690
2.31k
    uint32_t scanPos = 0;
691
2.31k
    uint32_t c1 = 1;
692
693
    // process trailing all-zero coefficient groups
694
695
    /* coefficients after lastNZ have no distortion signal cost */
696
2.31k
    const int zeroCG = cgNum - 1 - cgLastScanPos;
697
2.31k
    memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
698
2.31k
    memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t));
699
700
    /* sum zero coeff (uncoded) cost */
701
702
    // TODO: do we need these costs?
703
2.31k
    if (usePsyMask)
704
2.31k
    {
705
147k
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
706
145k
        {
707
145k
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
708
145k
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
709
145k
            uint32_t blkPos      = codeParams.scan[scanPosBase];
710
145k
#if X265_ARCH_X86
711
145k
            bool enable512 = detect512();
712
145k
            if (enable512)
713
0
                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
714
145k
            else
715
145k
            {
716
145k
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
717
145k
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
718
145k
            }
719
#else
720
            primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
721
            primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
722
#endif
723
145k
        }
724
2.31k
    }
725
0
    else
726
0
    {
727
        // non-psy path
728
0
        for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
729
0
        {
730
0
            X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
731
0
            uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
732
0
            uint32_t blkPos      = codeParams.scan[scanPosBase];
733
0
            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
734
0
        }
735
0
    }
736
2.31k
    static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
737
2.31k
    {
738
        // patternSigCtx = 0
739
2.31k
        {
740
2.31k
            2, 1, 1, 0,
741
2.31k
            1, 1, 0, 0,
742
2.31k
            1, 0, 0, 0,
743
2.31k
            0, 0, 0, 0,
744
2.31k
        },
745
        // patternSigCtx = 1
746
2.31k
        {
747
2.31k
            2, 2, 2, 2,
748
2.31k
            1, 1, 1, 1,
749
2.31k
            0, 0, 0, 0,
750
2.31k
            0, 0, 0, 0,
751
2.31k
        },
752
        // patternSigCtx = 2
753
2.31k
        {
754
2.31k
            2, 1, 0, 0,
755
2.31k
            2, 1, 0, 0,
756
2.31k
            2, 1, 0, 0,
757
2.31k
            2, 1, 0, 0,
758
2.31k
        },
759
        // patternSigCtx = 3
760
2.31k
        {
761
2.31k
            2, 2, 2, 2,
762
2.31k
            2, 2, 2, 2,
763
2.31k
            2, 2, 2, 2,
764
2.31k
            2, 2, 2, 2,
765
2.31k
        },
766
        // 4x4
767
2.31k
        {
768
2.31k
            0, 1, 4, 5,
769
2.31k
            2, 3, 4, 5,
770
2.31k
            6, 6, 8, 8,
771
2.31k
            7, 7, 8, 8
772
2.31k
        }
773
2.31k
    };
774
775
    /* iterate over coding groups in reverse scan order */
776
4.62k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--)
777
2.31k
    {
778
2.31k
        uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
779
2.31k
        const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
780
2.31k
        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
781
2.31k
        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
782
2.31k
        const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
783
2.31k
        const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
784
2.31k
        const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
785
786
2.31k
        if (c1 == 0)
787
0
            ctxSet++;
788
2.31k
        c1 = 1;
789
790
2.31k
        if (cgScanPos && (coeffNum[cgScanPos] == 0))
791
0
        {
792
            // TODO: do we need zero-coeff cost?
793
0
            const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
794
0
            uint32_t blkPos = codeParams.scan[scanPosBase];
795
0
            if (usePsyMask)
796
0
            {
797
0
#if X265_ARCH_X86
798
0
                bool enable512 = detect512();
799
0
                if (enable512)
800
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
801
0
                else
802
0
                {
803
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
804
0
                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
805
0
                }
806
#else
807
                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
808
                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
809
#endif
810
0
                blkPos = codeParams.scan[scanPosBase];
811
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
812
0
                {
813
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
814
0
                    {
815
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
816
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
817
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
818
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
819
820
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
821
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
822
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
823
0
                    }
824
0
                    blkPos += trSize;
825
0
                }
826
0
            }
827
0
            else
828
0
            {
829
                // non-psy path
830
0
                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
831
0
                blkPos = codeParams.scan[scanPosBase];
832
0
                for (int y = 0; y < MLS_CG_SIZE; y++)
833
0
                {
834
0
                    for (int x = 0; x < MLS_CG_SIZE; x++)
835
0
                    {
836
0
                        const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
837
0
                        const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
838
0
                        X265_CHECK(trSize > 4, "trSize check failure\n");
839
0
                        X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
840
841
0
                        costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
842
0
                        costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x];
843
0
                        sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
844
0
                    }
845
0
                    blkPos += trSize;
846
0
                }
847
0
            }
848
849
            /* there were no coded coefficients in this coefficient group */
850
0
            {
851
0
                uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
852
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
853
0
                totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
854
0
            }
855
0
            continue;
856
0
        }
857
858
2.31k
        coeffGroupRDStats cgRdStats;
859
2.31k
        memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
860
861
2.31k
        uint32_t subFlagMask = coeffFlag[cgScanPos];
862
2.31k
        int    c2            = 0;
863
2.31k
        uint32_t goRiceParam = 0;
864
2.31k
        uint32_t levelThreshold = 3;
865
2.31k
        uint32_t c1Idx       = 0;
866
2.31k
        uint32_t c2Idx       = 0;
867
        /* iterate over coefficients in each group in reverse scan order */
868
39.2k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
869
36.9k
        {
870
36.9k
            scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
871
36.9k
            uint32_t blkPos      = codeParams.scan[scanPos];
872
36.9k
            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
873
36.9k
            int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
874
36.9k
            int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
875
876
            /* RDOQ measures distortion as the squared difference between the unquantized coded level
877
             * and the original DCT coefficient. The result is shifted scaleBits to account for the
878
             * FIX15 nature of the CABAC cost tables minus the forward transform scale */
879
880
            /* cost of not coding this coefficient (all distortion, no signal bits) */
881
36.9k
            costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits;
882
36.9k
            X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n");
883
36.9k
            if (usePsyMask & scanPos)
884
                /* when no residual coefficient is coded, predicted coef == recon coef */
885
34.6k
                costUncoded[blkPos] -= PSYVALUE(predictedCoef);
886
887
36.9k
            totalUncodedCost += costUncoded[blkPos];
888
889
            // coefficient level estimation
890
36.9k
            const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
891
            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
892
36.9k
            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
893
36.9k
            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
894
36.9k
            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
895
            // NOTE: the above is equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
896
36.9k
            X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");
897
898
            // the reverse scan has not yet reached the last non-zero coeff
899
36.9k
            if (scanPos > (uint32_t)lastScanPos)
900
34.6k
            {
901
                /* coefficients after lastNZ have no distortion signal cost */
902
34.6k
                costCoeff[scanPos] = 0;
903
34.6k
                costSig[scanPos] = 0;
904
905
                /* No non-zero coefficient yet found, but this does not mean
906
                 * there is no uncoded-cost for this coefficient. Pre-
907
                 * quantization the coefficient may have been non-zero */
908
34.6k
                totalRdCost += costUncoded[blkPos];
909
34.6k
            }
910
2.31k
            else if (!(subFlagMask & 1))
911
0
            {
912
                // fast zero coeff path
913
                /* set default costs to uncoded costs */
914
0
                costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
915
0
                costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
916
0
                sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
917
0
                totalRdCost += costCoeff[scanPos];
918
0
                rateIncUp[blkPos] = greaterOneBits[0];
919
920
0
                subFlagMask >>= 1;
921
0
            }
922
2.31k
            else
923
2.31k
            {
924
2.31k
                subFlagMask >>= 1;
925
926
2.31k
                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
927
2.31k
                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}
928
929
2.31k
                X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
930
2.31k
                X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
931
2.31k
                X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
932
2.31k
                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");
933
934
                // coefficient level estimation
935
2.31k
                const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
936
2.31k
                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);
937
938
2.31k
                uint32_t level = 0;
939
2.31k
                uint32_t sigCoefBits = 0;
940
2.31k
                costCoeff[scanPos] = MAX_INT64;
941
942
2.31k
                if ((int)scanPos == lastScanPos)
943
2.31k
                    sigRateDelta[blkPos] = 0;
944
0
                else
945
0
                {
946
0
                    if (maxAbsLevel < 3)
947
0
                    {
948
                        /* set default costs to uncoded costs */
949
0
                        costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]);
950
0
                        costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos];
951
0
                    }
952
0
                    sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig];
953
0
                    sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
954
0
                }
955
956
2.31k
                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
957
                // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
958
2.31k
                if (maxAbsLevel == 1)
959
0
                {
960
0
                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
961
0
                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");
962
963
0
                    int unquantAbsLevel = unQuantLevel >> unquantShift;
964
0
                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
965
0
                    int d = abs(signCoef) - unquantAbsLevel;
966
0
                    int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
967
968
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
969
0
                    if (usePsyMask & scanPos)
970
0
                    {
971
0
                        int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
972
0
                        curCost -= PSYVALUE(reconCoef);
973
0
                    }
974
975
0
                    if (curCost < costCoeff[scanPos])
976
0
                    {
977
0
                        level = 1;
978
0
                        costCoeff[scanPos] = curCost;
979
0
                        costSig[scanPos] = SIGCOST(sigCoefBits);
980
0
                    }
981
0
                }
982
2.31k
                else if (maxAbsLevel)
983
2.31k
                {
984
2.31k
                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
985
2.31k
                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
986
987
2.31k
                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
988
989
2.31k
                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
990
2.31k
                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
991
2.31k
                    int d0 = abs(signCoef) - unquantAbsLevel0;
992
2.31k
                    int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);
993
994
2.31k
                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
995
2.31k
                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
996
2.31k
                    int d1 = abs(signCoef) - unquantAbsLevel1;
997
2.31k
                    int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);
998
999
                    /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
1000
2.31k
                    if (usePsyMask & scanPos)
1001
0
                    {
1002
0
                        int reconCoef;
1003
0
                        reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef));
1004
0
                        curCost0 -= PSYVALUE(reconCoef);
1005
1006
0
                        reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef));
1007
0
                        curCost1 -= PSYVALUE(reconCoef);
1008
0
                    }
1009
2.31k
                    if (curCost0 < costCoeff[scanPos])
1010
2.31k
                    {
1011
2.31k
                        level = maxAbsLevel;
1012
2.31k
                        costCoeff[scanPos] = curCost0;
1013
2.31k
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1014
2.31k
                    }
1015
2.31k
                    if (curCost1 < costCoeff[scanPos])
1016
42
                    {
1017
42
                        level = maxAbsLevel - 1;
1018
42
                        costCoeff[scanPos] = curCost1;
1019
42
                        costSig[scanPos] = SIGCOST(sigCoefBits);
1020
42
                    }
1021
2.31k
                }
1022
1023
2.31k
                dstCoeff[blkPos] = (int16_t)level;
1024
2.31k
                totalRdCost += costCoeff[scanPos];
1025
1026
                /* record costs for sign-hiding performed at the end */
1027
2.31k
                if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level)
1028
2.31k
                {
1029
2.31k
                    const int32_t diff0 = level - 1 - baseLevel;
1030
2.31k
                    const int32_t diff2 = level + 1 - baseLevel;
1031
2.31k
                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
1032
2.31k
                    int rate0, rate1, rate2;
1033
1034
2.31k
                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
1035
0
                    {
1036
                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
1037
                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
1038
0
                        X265_CHECK(level == 1, "absLevel check failure\n");
1039
1040
0
                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
1041
0
                        const int rateNotEqual2 = greaterOneBits[0];
1042
1043
0
                        rate0 = 0;
1044
0
                        rate2 = rateEqual2;
1045
0
                        rate1 = rateNotEqual2;
1046
1047
0
                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1048
0
                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1049
0
                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
1050
0
                    }
1051
2.31k
                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
1052
0
                    {
1053
                        // NOTE: no c1c2 correct rate since all of rate include this factor
1054
0
                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
1055
0
                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
1056
0
                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
1057
0
                    }
1058
2.31k
                    else
1059
2.31k
                    {
1060
2.31k
                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1061
2.31k
                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1062
2.31k
                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
1063
2.31k
                    }
1064
2.31k
                    rateIncUp[blkPos] = rate2 - rate1;
1065
2.31k
                    rateIncDown[blkPos] = rate0 - rate1;
1066
2.31k
                }
1067
0
                else
1068
0
                {
1069
0
                    rateIncUp[blkPos] = greaterOneBits[0];
1070
0
                    rateIncDown[blkPos] = 0;
1071
0
                }
1072
1073
                /* Update CABAC estimation state */
1074
2.31k
                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
1075
2.31k
                {
1076
2.31k
                    goRiceParam++;
1077
2.31k
                    levelThreshold <<= 1;
1078
2.31k
                }
1079
1080
2.31k
                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
1081
2.31k
                c1Idx += isNonZero;
1082
1083
                /* update bin model */
1084
2.31k
                if (level > 1)
1085
2.31k
                {
1086
2.31k
                    c1 = 0;
1087
2.31k
                    c2 += (uint32_t)(c2 - 2) >> 31;
1088
2.31k
                    c2Idx++;
1089
2.31k
                }
1090
0
                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
1091
0
                    c1++;
1092
1093
2.31k
                if (dstCoeff[blkPos])
1094
2.31k
                {
1095
2.31k
                    sigCoeffGroupFlag64 |= cgBlkPosMask;
1096
2.31k
                    cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
1097
2.31k
                    cgRdStats.uncodedDist += costUncoded[blkPos];
1098
2.31k
                    cgRdStats.nnzBeforePos0 += scanPosinCG;
1099
2.31k
                }
1100
2.31k
            }
1101
1102
36.9k
            cgRdStats.sigCost += costSig[scanPos];
1103
36.9k
        } /* end for (scanPosinCG) */
1104
1105
2.31k
        X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n");
1106
2.31k
        cgRdStats.sigCost0 = costSig[scanPos];
1107
1108
2.31k
        costCoeffGroupSig[cgScanPos] = 0;
1109
1110
        /* nothing to do in this case */
1111
2.31k
        X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n");
1112
1113
2.31k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1114
2.31k
        {
1115
            /* coeff group 0 is implied to be present, no signal cost */
1116
            /* coeff group with last NZ is implied to be present, handled below */
1117
2.31k
        }
1118
0
        else if (sigCoeffGroupFlag64 & cgBlkPosMask)
1119
0
        {
1120
0
            if (!cgRdStats.nnzBeforePos0)
1121
0
            {
1122
                /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
1123
0
                totalRdCost -= cgRdStats.sigCost0;
1124
0
                cgRdStats.sigCost -= cgRdStats.sigCost0;
1125
0
            }
1126
1127
            /* there are coded coefficients in this group, but now we include the signaling cost
1128
             * of the significant coefficient group flag and evaluate whether the RD cost of the
1129
             * coded group is more than the RD cost of the uncoded group */
1130
1131
0
            uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1132
1133
0
            int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1134
0
            costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
1135
0
            costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
1136
0
            costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
1137
1138
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
1139
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
1140
1141
0
            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
1142
0
            {
1143
0
                sigCoeffGroupFlag64 &= ~cgBlkPosMask;
1144
0
                totalRdCost = costZeroCG;
1145
0
                costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
1146
1147
                /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
1148
0
                const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize];
1149
0
                memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1150
0
                memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1151
0
                memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1152
0
                memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1153
0
            }
1154
0
        }
1155
0
        else
1156
0
        {
1157
            /* there were no coded coefficients in this coefficient group */
1158
0
            uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
1159
0
            costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
1160
0
            totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
1161
0
            totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
1162
0
        }
1163
2.31k
    } /* end for (cgScanPos) */
1164
1165
2.31k
    X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
1166
1167
    /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
1168
2.31k
    int64_t bestCost;
1169
2.31k
    if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
1170
0
    {
1171
0
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
1172
0
        totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
1173
0
    }
1174
2.31k
    else
1175
2.31k
    {
1176
2.31k
        int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
1177
2.31k
        bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
1178
2.31k
        totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
1179
2.31k
    }
1180
1181
    /* This loop starts with the last non-zero found in the first loop and then refines this last
1182
     * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
1183
     * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
1184
     * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
1185
     * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
1186
2.31k
    int  bestLastIdx = 0;
1187
2.31k
    bool foundLast = false;
1188
4.62k
    for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
1189
2.31k
    {
1190
2.31k
        if (!cgScanPos || cgScanPos == cgLastScanPos)
1191
2.31k
        {
1192
            /* the presence of these coefficient groups is inferred, they have no bit in
1193
             * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
1194
2.31k
        }
1195
0
        else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
1196
0
        {
1197
            /* remove cost of significant coeff group flag, the group's presence would be inferred
1198
             * from lastNZ if it were present in this group */
1199
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1200
0
        }
1201
0
        else
1202
0
        {
1203
            /* remove cost of signaling this empty group as not present */
1204
0
            totalRdCost -= costCoeffGroupSig[cgScanPos];
1205
0
            continue;
1206
0
        }
1207
1208
36.9k
        for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
1209
36.9k
        {
1210
36.9k
            scanPos = cgScanPos * cgSize + scanPosinCG;
1211
36.9k
            if ((int)scanPos > lastScanPos)
1212
34.6k
                continue;
1213
1214
            /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
1215
             * continue as if it were uncoded. If the coefficient was already uncoded, remove the
1216
             * cost of signaling it as not-significant */
1217
2.31k
            uint32_t blkPos = codeParams.scan[scanPos];
1218
2.31k
            if (dstCoeff[blkPos])
1219
2.31k
            {
1220
                // Calculates the cost of signaling the last significant coefficient in the block 
1221
2.31k
                uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) };
1222
2.31k
                if (codeParams.scanType == SCAN_VER)
1223
0
                    std::swap(pos[0], pos[1]);
1224
2.31k
                uint32_t bitsLastNZ = 0;
1225
1226
6.93k
                for (int i = 0; i < 2; i++)
1227
4.62k
                {
1228
4.62k
                    int temp = g_lastCoeffTable[pos[i]];
1229
4.62k
                    int prefixOnes = temp & 15;
1230
4.62k
                    int suffixLen = temp >> 4;
1231
1232
4.62k
                    bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes];
1233
4.62k
                    bitsLastNZ += IEP_RATE * suffixLen;
1234
4.62k
                }
1235
1236
2.31k
                int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
1237
1238
2.31k
                if (costAsLast < bestCost)
1239
2.31k
                {
1240
2.31k
                    bestLastIdx = scanPos + 1;
1241
2.31k
                    bestCost = costAsLast;
1242
2.31k
                }
1243
2.31k
                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
1244
2.31k
                {
1245
2.31k
                    foundLast = true;
1246
2.31k
                    break;
1247
2.31k
                }
1248
1249
0
                totalRdCost -= costCoeff[scanPos];
1250
0
                totalRdCost += costUncoded[blkPos];
1251
0
            }
1252
0
            else
1253
0
                totalRdCost -= costSig[scanPos];
1254
2.31k
        }
1255
2.31k
    }
1256
1257
    /* recount non-zero coefficients and re-apply sign of DCT coef */
1258
2.31k
    numSig = 0;
1259
4.62k
    for (int pos = 0; pos < bestLastIdx; pos++)
1260
2.31k
    {
1261
2.31k
        int blkPos = codeParams.scan[pos];
1262
2.31k
        int level  = dstCoeff[blkPos];
1263
2.31k
        numSig += (level != 0);
1264
1265
2.31k
        uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
1266
2.31k
        dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
1267
2.31k
    }
1268
1269
    // Average 49.62 pixels
1270
    /* clean uncoded coefficients */
1271
2.31k
    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
1272
36.9k
    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
1273
34.6k
    {
1274
34.6k
        dstCoeff[codeParams.scan[pos]] = 0;
1275
34.6k
    }
1276
2.31k
    for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE)
1277
0
    {
1278
0
        const uint32_t blkPos = codeParams.scan[pos];
1279
0
        memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff));
1280
0
        memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff));
1281
0
        memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff));
1282
0
        memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff));
1283
0
    }
1284
1285
    /* rate-distortion based sign-hiding */
1286
2.31k
    if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
1287
0
    {
1288
0
        const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
1289
0
        int lastCG = 1;
1290
1291
0
        for (int subSet = realLastScanPos; subSet >= 0; subSet--)
1292
0
        {
1293
0
            int subPos = subSet << LOG2_SCAN_SET_SIZE;
1294
0
            int n;
1295
1296
0
            if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet])))
1297
0
                continue;
1298
1299
            /* measure distance between first and last non-zero coef in this
1300
             * coding group */
1301
0
            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
1302
0
            const int firstNZPosInCG = (uint8_t)posFirstLast;
1303
0
            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
1304
0
            const uint32_t absSumSign = posFirstLast;
1305
1306
0
            if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
1307
0
            {
1308
0
                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);
1309
1310
#if CHECKED_BUILD || _DEBUG
1311
                int32_t absSum_dummy = 0;
1312
                for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
1313
                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
1314
                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
1315
#endif
1316
1317
                //if (signbit != absSumSign)
1318
0
                if (((int32_t)(signbit ^ absSumSign)) < 0)
1319
0
                {
1320
                    /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
1321
                     * is properly implied. Note dstCoeff[] are signed by this point but curChange and
1322
                     * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
1323
1324
0
                    int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
1325
0
                    uint32_t minPos = 0;
1326
0
                    int8_t finalChange = 0;
1327
0
                    int curChange = 0;
1328
0
                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;
1329
1330
0
                    for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
1331
0
                    {
1332
0
                        const uint32_t blkPos = codeParams.scan[n + subPos];
1333
0
                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
1334
0
                        const int absLevel = abs(dstCoeff[blkPos]);
1335
                        // TODO: this is constant in non-scaling mode
1336
0
                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
1337
0
                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
1338
1339
0
                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
1340
0
                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");
1341
1342
0
                        const int64_t origDist = (((int64_t)d * d));
1343
1344
0
#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))
1345
1346
0
                        const uint32_t isOne = (absLevel == 1);
1347
0
                        if (dstCoeff[blkPos])
1348
0
                        {
1349
0
                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
1350
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1351
0
                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);
1352
1353
                            /* if decrementing would make the coeff 0, we can include the
1354
                             * significant coeff flag cost savings */
1355
0
                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
1356
0
                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
1357
0
                            int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
1358
0
                            int64_t costDown = DELTARDCOST(origDist, d, downBits);
1359
1360
0
                            costDown -= lastCoeffAdjust;
1361
0
                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;
1362
1363
0
                            curChange = 2 * (costUp < costDown) - 1;
1364
0
                            curCost = (costUp < costDown) ? costUp : curCost;
1365
0
                        }
1366
                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
1367
0
                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
1368
0
                        {
1369
                            /* don't try to make a new coded coeff before the first coeff if its
1370
                             * sign would be different than the first coeff, the inferred sign would
1371
                             * still be wrong and we'd have to do this again. */
1372
0
                            curCost = MAX_INT64;
1373
0
                        }
1374
0
                        else
1375
0
                        {
1376
                            /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
1377
0
                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
1378
0
                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
1379
0
                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
1380
0
                            curChange = 1;
1381
0
                        }
1382
1383
0
                        if (curCost < minCostInc)
1384
0
                        {
1385
0
                            minCostInc = curCost;
1386
0
                            finalChange = (int8_t)curChange;
1387
0
                            minPos = blkPos + (absLevel << 16);
1388
0
                        }
1389
0
                        lastCoeffAdjust = 0;
1390
0
                    }
1391
1392
0
                    const int absInMinPos = (minPos >> 16);
1393
0
                    minPos = (uint16_t)minPos;
1394
1395
                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
1396
0
                    if (absInMinPos >= 32767)
1397
                        /* don't allow sign hiding to violate the SPEC range */
1398
0
                        finalChange = -1;
1399
1400
                    // NOTE: Reference code
1401
                    //if (dstCoeff[minPos] == 0)
1402
                    //    numSig++;
1403
                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
1404
                    //    numSig--;
1405
0
                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
1406
1407
1408
                    // NOTE: Reference code
1409
                    //if (m_resiDctCoeff[minPos] >= 0)
1410
                    //    dstCoeff[minPos] += finalChange;
1411
                    //else
1412
                    //    dstCoeff[minPos] -= finalChange;
1413
0
                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
1414
0
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
1415
0
                }
1416
0
            }
1417
1418
0
            lastCG = 0;
1419
0
        }
1420
0
    }
1421
1422
2.31k
    return numSig;
1423
20.7k
}
1424
1425
/* Context derivation process of coeff_abs_significant_flag */
1426
uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma,
1427
                             uint32_t firstSignificanceMapContext)
1428
0
{
1429
0
    static const uint8_t ctxIndMap[16] =
1430
0
    {
1431
0
        0, 1, 4, 5,
1432
0
        2, 3, 4, 5,
1433
0
        6, 6, 8, 8,
1434
0
        7, 7, 8, 8
1435
0
    };
1436
1437
0
    if (!blkPos) // special case for the DC context variable
1438
0
        return 0;
1439
1440
0
    if (log2TrSize == 2) // 4x4
1441
0
        return ctxIndMap[blkPos];
1442
1443
0
    const uint32_t posY = blkPos >> log2TrSize;
1444
0
    const uint32_t posX = blkPos & (trSize - 1);
1445
0
    X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n");
1446
1447
0
    int posXinSubset = blkPos & 3;
1448
0
    X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n");
1449
0
    int posYinSubset = posY & 3;
1450
1451
    // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
1452
0
    static const uint8_t table_cnt[4][4][4] =
1453
0
    {
1454
        // patternSigCtx = 0
1455
0
        {
1456
0
            { 2, 1, 1, 0 },
1457
0
            { 1, 1, 0, 0 },
1458
0
            { 1, 0, 0, 0 },
1459
0
            { 0, 0, 0, 0 },
1460
0
        },
1461
        // patternSigCtx = 1
1462
0
        {
1463
0
            { 2, 1, 0, 0 },
1464
0
            { 2, 1, 0, 0 },
1465
0
            { 2, 1, 0, 0 },
1466
0
            { 2, 1, 0, 0 },
1467
0
        },
1468
        // patternSigCtx = 2
1469
0
        {
1470
0
            { 2, 2, 2, 2 },
1471
0
            { 1, 1, 1, 1 },
1472
0
            { 0, 0, 0, 0 },
1473
0
            { 0, 0, 0, 0 },
1474
0
        },
1475
        // patternSigCtx = 3
1476
0
        {
1477
0
            { 2, 2, 2, 2 },
1478
0
            { 2, 2, 2, 2 },
1479
0
            { 2, 2, 2, 2 },
1480
0
            { 2, 2, 2, 2 },
1481
0
        }
1482
0
    };
1483
1484
0
    int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset];
1485
0
    int offset = firstSignificanceMapContext;
1486
1487
0
    offset += cnt;
1488
1489
0
    return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset;
1490
0
}
1491