/src/x265/source/common/quant.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * Min Chen <chenm003@163.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #include "common.h" |
26 | | #include "primitives.h" |
27 | | #include "quant.h" |
28 | | #include "framedata.h" |
29 | | #include "entropy.h" |
30 | | #include "yuv.h" |
31 | | #include "cudata.h" |
32 | | #include "contexts.h" |
33 | | |
34 | | using namespace X265_NS; |
35 | | |
/* Conditionally negates x using the sign of y: expands to x when y >= 0 and
 * to -x when y < 0.  (y >> 31) is used as an all-ones / all-zeros mask, so y
 * is expected to be a 32-bit signed value and >> on a negative value is
 * expected to be an arithmetic shift.
 * Both arguments are fully parenthesized so that compound expressions such as
 * SIGN(a | b, c ? 1 : -1) expand with the intended precedence.  Note that y is
 * still evaluated twice; do not pass expressions with side effects. */
#define SIGN(x,y) (((x)^((y) >> 31))-((y) >> 31))
37 | | |
38 | | namespace { |
39 | | |
/* Rate-distortion bookkeeping for one coefficient group. */
struct coeffGroupRDStats
{
    // non-zero when coefficients other than the one at position 0 are coded
    int     nnzBeforePos0;

    // distortion plus level-coding cost of the coded coefficients
    int64_t codedLevelAndDist;

    // distortion the coded coefficients would have if left uncoded
    int64_t uncodedDist;

    // cost of signalling the significant-coefficient bitmap
    int64_t sigCost;

    // cost of signalling the significance bit of coefficient 0
    int64_t sigCost0;
};
48 | | |
/* Returns the smaller of x and y.
 *
 * Implemented with a plain comparison.  The bit-trick form
 * y + ((x - y) & ((x - y) >> 31)) evaluates (x - y) in signed arithmetic,
 * which overflows (undefined behaviour, and a wrong answer in practice) when
 * the operands are far apart, e.g. fastMin(INT_MAX, -1).  Compilers emit a
 * branch-free min for the comparison form anyway.
 *
 * Declared static so the helper has internal linkage regardless of the
 * enclosing scope. */
static inline int fastMin(int x, int y)
{
    return (x < y) ? x : y; // min(x, y)
}
53 | | |
54 | | inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate) |
55 | 152k | { |
56 | 152k | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); |
57 | 152k | if (!absLevel) |
58 | 1.95k | { |
59 | 1.95k | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); |
60 | 1.95k | return 0; |
61 | 1.95k | } |
62 | 150k | int rate = 0; |
63 | | |
64 | 150k | if (diffLevel < 0) |
65 | 12.8k | { |
66 | 12.8k | X265_CHECK(absLevel <= 2, "absLevel check failure\n"); |
67 | 12.8k | rate += greaterOneBits[(absLevel == 2)]; |
68 | | |
69 | 12.8k | if (absLevel == 2) |
70 | 4.42k | rate += levelAbsBits[0]; |
71 | 12.8k | } |
72 | 137k | else |
73 | 137k | { |
74 | 137k | uint32_t symbol = diffLevel; |
75 | 137k | bool expGolomb = (symbol > maxVlc); |
76 | | |
77 | 137k | if (expGolomb) |
78 | 117k | { |
79 | 117k | absLevel = symbol - maxVlc; |
80 | | |
81 | | // NOTE: mapping to x86 hardware instruction BSR |
82 | 117k | unsigned long size; |
83 | 117k | CLZ(size, absLevel); |
84 | 117k | int egs = size * 2 + 1; |
85 | | |
86 | 117k | rate += egs << 15; |
87 | | |
88 | | // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1) |
89 | 117k | X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n"); |
90 | 117k | symbol = maxVlc + 1; |
91 | 117k | } |
92 | | |
93 | 137k | uint32_t prefLen = (symbol >> absGoRice) + 1; |
94 | 137k | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); |
95 | | |
96 | 137k | rate += numBins << 15; |
97 | 137k | rate += c1c2Rate; |
98 | 137k | } |
99 | 150k | return rate; |
100 | 152k | } |
101 | | |
102 | | #if CHECKED_BUILD || _DEBUG |
103 | | inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits) |
104 | | { |
105 | | X265_CHECK(absLevel <= 2, "absLevel check failure\n"); |
106 | | |
107 | | int rate; |
108 | | if (absLevel == 0) |
109 | | rate = 0; |
110 | | else if (absLevel == 2) |
111 | | rate = greaterOneBits[1] + levelAbsBits[0]; |
112 | | else |
113 | | rate = greaterOneBits[0]; |
114 | | return rate; |
115 | | } |
116 | | #endif |
117 | | |
118 | | inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice) |
119 | 102k | { |
120 | 102k | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); |
121 | 102k | if (!absLevel) |
122 | 0 | { |
123 | 0 | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); |
124 | 0 | return 0; |
125 | 0 | } |
126 | 102k | int rate; |
127 | | |
128 | 102k | uint32_t symbol = diffLevel; |
129 | 102k | uint32_t prefLen = (symbol >> absGoRice) + 1; |
130 | 102k | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); |
131 | | |
132 | 102k | rate = numBins << 15; |
133 | | |
134 | 102k | return rate; |
135 | 102k | } |
136 | | |
137 | | /* Calculates the cost for specific absolute transform level */ |
138 | | inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate) |
139 | 168k | { |
140 | 168k | X265_CHECK(absLevel, "absLevel should not be zero\n"); |
141 | | |
142 | 168k | if (diffLevel < 0) |
143 | 13.0k | { |
144 | 13.0k | X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n"); |
145 | | |
146 | 13.0k | uint32_t rate = greaterOneBits[(absLevel == 2)]; |
147 | 13.0k | if (absLevel == 2) |
148 | 5.59k | rate += levelAbsBits[0]; |
149 | 13.0k | return rate; |
150 | 13.0k | } |
151 | 155k | else |
152 | 155k | { |
153 | 155k | uint32_t rate; |
154 | 155k | uint32_t symbol = diffLevel; |
155 | 155k | if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) |
156 | 53.3k | { |
157 | 53.3k | uint32_t length = symbol >> absGoRice; |
158 | 53.3k | rate = (length + 1 + absGoRice) << 15; |
159 | 53.3k | } |
160 | 102k | else |
161 | 102k | { |
162 | 102k | uint32_t length = 0; |
163 | 102k | symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; |
164 | 102k | if (symbol) |
165 | 93.5k | { |
166 | 93.5k | unsigned long idx; |
167 | 93.5k | CLZ(idx, symbol + 1); |
168 | 93.5k | length = idx; |
169 | 93.5k | } |
170 | | |
171 | 102k | rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15; |
172 | 102k | } |
173 | 155k | rate += c1c2Rate; |
174 | 155k | return rate; |
175 | 155k | } |
176 | 168k | } |
177 | | |
178 | | } |
179 | | |
180 | | Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>}; |
181 | | |
182 | | Quant::Quant() |
183 | 22.6k | { |
184 | 22.6k | m_resiDctCoeff = NULL; |
185 | 22.6k | m_fencDctCoeff = NULL; |
186 | 22.6k | m_fencShortBuf = NULL; |
187 | 22.6k | m_frameNr = NULL; |
188 | 22.6k | m_nr = NULL; |
189 | 22.6k | } |
190 | | |
191 | | bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy) |
192 | 22.6k | { |
193 | 22.6k | m_entropyCoder = &entropy; |
194 | 22.6k | m_psyRdoqScale = (int32_t)(psyScale * 256.0); |
195 | 22.6k | X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n"); |
196 | 22.6k | m_scalingList = &scalingList; |
197 | 22.6k | m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); |
198 | 22.6k | m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); |
199 | 22.6k | m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); |
200 | | |
201 | 22.6k | return m_resiDctCoeff && m_fencShortBuf; |
202 | 22.6k | } |
203 | | |
204 | | bool Quant::allocNoiseReduction(const x265_param& param) |
205 | 0 | { |
206 | 0 | m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads); |
207 | 0 | if (m_frameNr) |
208 | 0 | memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads); |
209 | 0 | else |
210 | 0 | return false; |
211 | 0 | return true; |
212 | 0 | } |
213 | | |
214 | | Quant::~Quant() |
215 | 22.6k | { |
216 | 22.6k | X265_FREE(m_frameNr); |
217 | 22.6k | X265_FREE(m_resiDctCoeff); |
218 | 22.6k | X265_FREE(m_fencShortBuf); |
219 | 22.6k | } |
220 | | |
221 | | void Quant::setQPforQuant(const CUData& ctu, int qp) |
222 | 28.1k | { |
223 | 28.1k | m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; |
224 | 28.1k | m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET); |
225 | 28.1k | m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel; |
226 | 28.1k | if (ctu.m_chromaFormat != X265_CSP_I400) |
227 | 28.1k | { |
228 | 28.1k | setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0] + ctu.m_slice->m_chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat); |
229 | 28.1k | setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1] + ctu.m_slice->m_chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat); |
230 | 28.1k | } |
231 | 28.1k | } |
232 | | |
233 | | void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) |
234 | 56.3k | { |
235 | 56.3k | int qp = x265_clip3(-QP_BD_OFFSET, 57, qpin); |
236 | 56.3k | if (qp >= 30) |
237 | 13.0k | { |
238 | 13.0k | if (chFmt == X265_CSP_I420) |
239 | 13.0k | qp = g_chromaScale[qp]; |
240 | 0 | else |
241 | 0 | qp = X265_MIN(qp, QP_MAX_SPEC); |
242 | 13.0k | } |
243 | 56.3k | m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET); |
244 | 56.3k | } |
245 | | |
246 | | /* To minimize the distortion only. No rate is considered */ |
247 | | uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize) |
248 | 0 | { |
249 | 0 | uint32_t trSize = 1 << log2TrSize; |
250 | 0 | const uint16_t* scan = codeParams.scan; |
251 | |
|
252 | 0 | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] |
253 | 0 | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign |
254 | 0 | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff |
255 | |
|
256 | | #if CHECKED_BUILD || _DEBUG |
257 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group |
258 | | memset(coeffNum, 0, sizeof(coeffNum)); |
259 | | memset(coeffSign, 0, sizeof(coeffNum)); |
260 | | memset(coeffFlag, 0, sizeof(coeffNum)); |
261 | | #endif |
262 | 0 | const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); |
263 | 0 | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); |
264 | 0 | unsigned long tmp; |
265 | | |
266 | | // first CG need specially processing |
267 | 0 | const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF); |
268 | 0 | coeffFlag[cgLastScanPos] <<= correctOffset; |
269 | |
|
270 | 0 | for (int cg = cgLastScanPos; cg >= 0; cg--) |
271 | 0 | { |
272 | 0 | int cgStartPos = cg << LOG2_SCAN_SET_SIZE; |
273 | 0 | int n; |
274 | |
|
275 | | #if CHECKED_BUILD || _DEBUG |
276 | | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) |
277 | | if (coeff[scan[n + cgStartPos]]) |
278 | | break; |
279 | | int lastNZPosInCG0 = n; |
280 | | #endif |
281 | |
|
282 | 0 | if (coeffNum[cg] == 0) |
283 | 0 | { |
284 | 0 | X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n"); |
285 | 0 | continue; |
286 | 0 | } |
287 | | |
288 | | #if CHECKED_BUILD || _DEBUG |
289 | | for (n = 0;; n++) |
290 | | if (coeff[scan[n + cgStartPos]]) |
291 | | break; |
292 | | |
293 | | int firstNZPosInCG0 = n; |
294 | | #endif |
295 | | |
296 | 0 | CLZ(tmp, coeffFlag[cg]); |
297 | 0 | const int firstNZPosInCG = (15 ^ tmp); |
298 | |
|
299 | 0 | CTZ(tmp, coeffFlag[cg]); |
300 | 0 | const int lastNZPosInCG = (15 ^ tmp); |
301 | |
|
302 | 0 | X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n"); |
303 | 0 | X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n"); |
304 | |
|
305 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) |
306 | 0 | { |
307 | 0 | uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1; |
308 | 0 | uint32_t absSum = 0; |
309 | |
|
310 | 0 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) |
311 | 0 | absSum += coeff[scan[n + cgStartPos]]; |
312 | |
|
313 | 0 | if (signbit != (absSum & 0x1)) // compare signbit with sum_parity |
314 | 0 | { |
315 | 0 | int minCostInc = MAX_INT, minPos = -1, curCost = MAX_INT; |
316 | 0 | int32_t finalChange = 0, curChange = 0; |
317 | 0 | uint32_t cgFlags = coeffFlag[cg]; |
318 | 0 | if (cg == cgLastScanPos) |
319 | 0 | cgFlags >>= correctOffset; |
320 | |
|
321 | 0 | for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) |
322 | 0 | { |
323 | 0 | uint32_t blkPos = scan[n + cgStartPos]; |
324 | 0 | X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n"); |
325 | |
|
326 | 0 | if (cgFlags & 1) |
327 | 0 | { |
328 | 0 | if (deltaU[blkPos] > 0) |
329 | 0 | { |
330 | 0 | curCost = -deltaU[blkPos]; |
331 | 0 | curChange = 1; |
332 | 0 | } |
333 | 0 | else |
334 | 0 | { |
335 | 0 | if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1)) |
336 | 0 | { |
337 | 0 | X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n"); |
338 | 0 | curCost = MAX_INT; |
339 | 0 | } |
340 | 0 | else |
341 | 0 | { |
342 | 0 | curCost = deltaU[blkPos]; |
343 | 0 | curChange = -1; |
344 | 0 | } |
345 | 0 | } |
346 | 0 | } |
347 | 0 | else |
348 | 0 | { |
349 | 0 | if (cgFlags == 0) |
350 | 0 | { |
351 | 0 | X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n"); |
352 | 0 | uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1; |
353 | 0 | if (thisSignBit != signbit) |
354 | 0 | curCost = MAX_INT; |
355 | 0 | else |
356 | 0 | { |
357 | 0 | curCost = -deltaU[blkPos]; |
358 | 0 | curChange = 1; |
359 | 0 | } |
360 | 0 | } |
361 | 0 | else |
362 | 0 | { |
363 | 0 | curCost = -deltaU[blkPos]; |
364 | 0 | curChange = 1; |
365 | 0 | } |
366 | 0 | } |
367 | |
|
368 | 0 | if (curCost < minCostInc) |
369 | 0 | { |
370 | 0 | minCostInc = curCost; |
371 | 0 | finalChange = curChange; |
372 | 0 | minPos = blkPos; |
373 | 0 | } |
374 | 0 | cgFlags>>=1; |
375 | 0 | } |
376 | | |
377 | | /* do not allow change to violate coeff clamp */ |
378 | 0 | if (coeff[minPos] == 32767 || coeff[minPos] == -32768) |
379 | 0 | finalChange = -1; |
380 | |
|
381 | 0 | if (!coeff[minPos]) |
382 | 0 | numSig++; |
383 | 0 | else if (finalChange == -1 && abs(coeff[minPos]) == 1) |
384 | 0 | numSig--; |
385 | |
|
386 | 0 | { |
387 | 0 | const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15; |
388 | 0 | coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask; |
389 | 0 | } |
390 | 0 | } |
391 | 0 | } |
392 | 0 | } |
393 | |
|
394 | 0 | return numSig; |
395 | 0 | } |
396 | | |
397 | | uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, |
398 | | coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) |
399 | 13.0M | { |
400 | 13.0M | const uint32_t sizeIdx = log2TrSize - 2; |
401 | | |
402 | 13.0M | if (cu.m_tqBypass[0]) |
403 | 4.04M | { |
404 | 4.04M | X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); |
405 | 4.04M | return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride); |
406 | 4.04M | } |
407 | | |
408 | 8.98M | bool isLuma = ttype == TEXT_LUMA; |
409 | 8.98M | bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; |
410 | 8.98M | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform |
411 | | |
412 | 8.98M | X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); |
413 | 8.98M | if (useTransformSkip) |
414 | 0 | { |
415 | 0 | #if X265_DEPTH <= 10 |
416 | 0 | X265_CHECK(transformShift >= 0, "invalid transformShift\n"); |
417 | 0 | primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift); |
418 | | #else |
419 | | if (transformShift >= 0) |
420 | | primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift); |
421 | | else |
422 | | primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift); |
423 | | #endif |
424 | 0 | } |
425 | 8.98M | else |
426 | 8.98M | { |
427 | 8.98M | bool isIntra = cu.isIntra(absPartIdx); |
428 | | |
429 | 8.98M | if (!sizeIdx && isLuma && isIntra) |
430 | 2.80M | primitives.dst4x4(residual, m_resiDctCoeff, resiStride); |
431 | 6.17M | else |
432 | 6.17M | primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride); |
433 | | |
434 | | /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so |
435 | | * there is no risk of performing this DCT unnecessarily */ |
436 | 8.98M | if (usePsy) |
437 | 3.68M | { |
438 | 3.68M | int trSize = 1 << log2TrSize; |
439 | | /* perform DCT on source pixels for psy-rdoq */ |
440 | 3.68M | primitives.cu[sizeIdx].copy_ps(m_fencShortBuf, trSize, fenc, fencStride); |
441 | 3.68M | primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize); |
442 | 3.68M | } |
443 | | |
444 | 8.98M | if (m_nr && m_nr->offset) |
445 | 0 | { |
446 | | /* denoise is not applied to intra residual, so DST can be ignored */ |
447 | 0 | int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra; |
448 | 0 | int numCoeff = 1 << (log2TrSize * 2); |
449 | 0 | primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff); |
450 | 0 | m_nr->count[cat]++; |
451 | 0 | } |
452 | 8.98M | } |
453 | | |
454 | 8.98M | if (m_rdoqLevel) |
455 | 9.00M | return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy); |
456 | 18.4E | else |
457 | 18.4E | { |
458 | 18.4E | int deltaU[32 * 32]; |
459 | | |
460 | 18.4E | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; |
461 | 18.4E | int rem = m_qpParam[ttype].rem; |
462 | 18.4E | int per = m_qpParam[ttype].per; |
463 | 18.4E | const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
464 | | |
465 | 18.4E | int qbits = QUANT_SHIFT + per + transformShift; |
466 | 18.4E | int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9); |
467 | 18.4E | int numCoeff = 1 << (log2TrSize * 2); |
468 | | |
469 | 18.4E | uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff); |
470 | | |
471 | 18.4E | if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled) |
472 | 0 | { |
473 | 0 | TUEntropyCodingParameters codeParams; |
474 | 0 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma); |
475 | 0 | return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize); |
476 | 0 | } |
477 | 18.4E | else |
478 | 18.4E | return numSig; |
479 | 18.4E | } |
480 | 8.98M | } |
481 | | |
482 | | uint64_t Quant::ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx) |
483 | 0 | { |
484 | 0 | static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416 |
485 | 0 | static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963 |
486 | 0 | int shift = (X265_DEPTH - 8); |
487 | |
|
488 | 0 | int trSize = 1 << log2TrSize; |
489 | 0 | uint64_t ssDc = 0, ssBlock = 0, ssAc = 0; |
490 | | |
491 | | // Calculation of (X(0) - Y(0)) * (X(0) - Y(0)), DC |
492 | 0 | ssDc = 0; |
493 | 0 | for (int y = 0; y < trSize; y += 4) |
494 | 0 | { |
495 | 0 | for (int x = 0; x < trSize; x += 4) |
496 | 0 | { |
497 | 0 | int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff |
498 | 0 | ssDc += temp * temp; |
499 | 0 | } |
500 | 0 | } |
501 | | |
502 | | // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC |
503 | 0 | ssBlock = 0; |
504 | 0 | uint64_t ac_k = 0; |
505 | 0 | primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k); |
506 | 0 | ssAc = ssBlock - ssDc; |
507 | | |
508 | | // 1. Calculation of fdc' |
509 | | // Calculate numerator of dc normalization factor |
510 | 0 | uint64_t fDc_num = 0; |
511 | | |
512 | | // 2. Calculate dc component |
513 | 0 | uint64_t dc_k = 0; |
514 | 0 | for (int block_yy = 0; block_yy < trSize; block_yy += 4) |
515 | 0 | { |
516 | 0 | for (int block_xx = 0; block_xx < trSize; block_xx += 4) |
517 | 0 | { |
518 | 0 | uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift; |
519 | 0 | dc_k += temp * temp; |
520 | 0 | } |
521 | 0 | } |
522 | |
|
523 | 0 | fDc_num = (2 * dc_k) + (trSize * trSize * ssim_c1); // 16 pixels -> for each 4x4 block |
524 | 0 | fDc_num /= ((trSize >> 2) * (trSize >> 2)); |
525 | | |
526 | | // 1. Calculation of fac' |
527 | | // Calculate numerator of ac normalization factor |
528 | 0 | uint64_t fAc_num = 0; |
529 | | |
530 | | // 2. Calculate ac component |
531 | 0 | ac_k -= dc_k; |
532 | |
|
533 | 0 | double s = 1 + 0.005 * cu.m_qp[absPartIdx]; |
534 | |
|
535 | 0 | fAc_num = ac_k + uint64_t(s * ac_k) + ssim_c2; |
536 | 0 | fAc_num /= ((trSize >> 2) * (trSize >> 2)); |
537 | | |
538 | | // Calculate dc and ac normalization factor |
539 | 0 | uint64_t ssim_distortion = ((ssDc * cu.m_fDc_den[ttype]) / fDc_num) + ((ssAc * cu.m_fAc_den[ttype]) / fAc_num); |
540 | 0 | return ssim_distortion; |
541 | 0 | } |
542 | | |
543 | | void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff, |
544 | | uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) |
545 | 53.4k | { |
546 | 53.4k | const uint32_t sizeIdx = log2TrSize - 2; |
547 | 53.4k | if (cu.m_tqBypass[0]) |
548 | 14.1k | { |
549 | 14.1k | primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0); |
550 | 14.1k | return; |
551 | 14.1k | } |
552 | | // Values need to pass as input parameter in dequant |
553 | 39.2k | int rem = m_qpParam[ttype].rem; |
554 | 39.2k | int per = m_qpParam[ttype].per; |
555 | 39.2k | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; |
556 | 39.2k | int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; |
557 | 39.2k | int numCoeff = 1 << (log2TrSize * 2); |
558 | | |
559 | 39.2k | if (m_scalingList->m_bEnabled) |
560 | 0 | { |
561 | 0 | int scalingListType = (bIntra ? 0 : 3) + ttype; |
562 | 0 | const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem]; |
563 | 0 | primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); |
564 | 0 | } |
565 | 39.2k | else |
566 | 39.2k | { |
567 | 39.2k | int scale = m_scalingList->s_invQuantScales[rem] << per; |
568 | 39.2k | primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift); |
569 | 39.2k | } |
570 | | |
571 | 39.2k | if (useTransformSkip) |
572 | 0 | { |
573 | 0 | #if X265_DEPTH <= 10 |
574 | 0 | X265_CHECK(transformShift > 0, "invalid transformShift\n"); |
575 | 0 | primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift); |
576 | | #else |
577 | | if (transformShift > 0) |
578 | | primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift); |
579 | | else |
580 | | primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift); |
581 | | #endif |
582 | 0 | } |
583 | 39.2k | else |
584 | 39.2k | { |
585 | 39.2k | int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; |
586 | 39.2k | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n"); |
587 | | // DC only |
588 | 39.2k | if (numSig == 1 && coeff[0] != 0 && !useDST) |
589 | 33.0k | { |
590 | 33.0k | const int shift_1st = 7 - 6; |
591 | 33.0k | const int add_1st = 1 << (shift_1st - 1); |
592 | 33.0k | const int shift_2nd = 12 - (X265_DEPTH - 8) - 3; |
593 | 33.0k | const int add_2nd = 1 << (shift_2nd - 1); |
594 | | |
595 | 33.0k | int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd; |
596 | 33.0k | primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val); |
597 | 33.0k | return; |
598 | 33.0k | } |
599 | | |
600 | 6.20k | if (useDST) |
601 | 6.20k | primitives.idst4x4(m_resiDctCoeff, residual, resiStride); |
602 | 0 | else |
603 | 0 | primitives.cu[sizeIdx].idct(m_resiDctCoeff, residual, resiStride); |
604 | 6.20k | } |
605 | 39.2k | } |
606 | | |
607 | | /* Rate distortion optimized quantization for entropy coding engines using |
608 | | * probability models like CABAC */ |
609 | | template<uint32_t log2TrSize> |
610 | | uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy) |
611 | 9.02M | { |
612 | 9.02M | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
613 | 9.02M | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; |
614 | 9.02M | const uint32_t usePsyMask = usePsy ? -1 : 0; |
615 | | |
616 | 9.02M | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); |
617 | | |
618 | 9.02M | int rem = m_qpParam[ttype].rem; |
619 | 9.02M | int per = m_qpParam[ttype].per; |
620 | 9.02M | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ |
621 | 9.02M | int add = (1 << (qbits - 1)); |
622 | 9.02M | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
623 | | |
624 | 9.02M | const int numCoeff = 1 << (log2TrSize * 2); |
625 | 9.02M | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); |
626 | 9.02M | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); |
627 | 9.02M | if (!numSig) |
628 | 8.93M | return 0; |
629 | 89.0k | const uint32_t trSize = 1 << log2TrSize; |
630 | 89.0k | int64_t lambda2 = m_qpParam[ttype].lambda2; |
631 | 89.0k | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); |
632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) |
633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping |
634 | | * at several stages. We skip the clipping for simplicity when measuring RD cost */ |
635 | 89.0k | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; |
636 | 89.0k | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); |
637 | 89.0k | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; |
638 | 89.0k | const int scaleBits = SCALE_BITS - 2 * transformShift; |
639 | | |
640 | 89.0k | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) |
641 | 569k | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) |
642 | 197k | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) |
643 | 421k | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) |
644 | | |
645 | 89.0k | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ |
646 | 89.0k | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ |
647 | 89.0k | int64_t costSig[trSize * trSize]; /* lambda * bits */ |
648 | | |
649 | 89.0k | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ |
650 | 89.0k | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ |
651 | 89.0k | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ |
652 | | |
653 | 89.0k | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ |
654 | 89.0k | uint64_t sigCoeffGroupFlag64 = 0; |
655 | | |
656 | 89.0k | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ |
657 | 89.0k | bool bIsLuma = ttype == TEXT_LUMA; |
658 | | |
659 | | /* total rate distortion cost of transform block, as CBF=0 */ |
660 | 89.0k | int64_t totalUncodedCost = 0; |
661 | | |
662 | | /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks, |
663 | | * the distortion and signal cost of coded blocks, and the coding cost of significant |
664 | | * coefficient and coefficient group bitmaps */ |
665 | 89.0k | int64_t totalRdCost = 0; |
666 | | |
667 | 89.0k | TUEntropyCodingParameters codeParams; |
668 | 89.0k | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); |
669 | 89.0k | const uint32_t log2TrSizeCG = log2TrSize - 2; |
670 | 89.0k | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); |
671 | 89.0k | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); |
672 | | |
673 | 89.0k | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] |
674 | 89.0k | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign |
675 | 89.0k | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff |
676 | | |
677 | | #if CHECKED_BUILD || _DEBUG |
678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group |
679 | | memset(coeffNum, 0, sizeof(coeffNum)); |
680 | | memset(coeffSign, 0, sizeof(coeffNum)); |
681 | | memset(coeffFlag, 0, sizeof(coeffNum)); |
682 | | #endif |
683 | 89.0k | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); |
684 | 89.0k | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); |
685 | | |
686 | | |
687 | | /* TODO: update bit estimates if dirty */ |
688 | 89.0k | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; |
689 | | |
690 | 89.0k | uint32_t scanPos = 0; |
691 | 89.0k | uint32_t c1 = 1; |
692 | | |
693 | | // process trail all zero Coeff Group |
694 | | |
695 | | /* coefficients after lastNZ have no distortion signal cost */ |
696 | 89.0k | const int zeroCG = cgNum - 1 - cgLastScanPos; |
697 | 89.0k | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); |
698 | 89.0k | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); |
699 | | |
700 | | /* sum zero coeff (uncoded) cost */ |
701 | | |
702 | | // TODO: do we need these costs? |
703 | 89.0k | if (usePsyMask) |
704 | 20.6k | { |
705 | 297k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) |
706 | 276k | { |
707 | 276k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); |
708 | 276k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
709 | 276k | uint32_t blkPos = codeParams.scan[scanPosBase]; |
710 | 276k | #if X265_ARCH_X86 |
711 | 276k | bool enable512 = detect512(); |
712 | 276k | if (enable512) |
713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
714 | 276k | else |
715 | 276k | { |
716 | 276k | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); |
717 | 276k | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
718 | 276k | } |
719 | | #else |
720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
722 | | #endif |
723 | 276k | } |
724 | 20.6k | } |
725 | 68.4k | else |
726 | 68.4k | { |
727 | | // non-psy path |
728 | 174k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) |
729 | 106k | { |
730 | 106k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); |
731 | 106k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
732 | 106k | uint32_t blkPos = codeParams.scan[scanPosBase]; |
733 | 106k | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
734 | 106k | } |
735 | 68.4k | } |
736 | 89.0k | static const uint8_t table_cnt[5][SCAN_SET_SIZE] = |
737 | 89.0k | { |
738 | | // patternSigCtx = 0 |
739 | 89.0k | { |
740 | 89.0k | 2, 1, 1, 0, |
741 | 89.0k | 1, 1, 0, 0, |
742 | 89.0k | 1, 0, 0, 0, |
743 | 89.0k | 0, 0, 0, 0, |
744 | 89.0k | }, |
745 | | // patternSigCtx = 1 |
746 | 89.0k | { |
747 | 89.0k | 2, 2, 2, 2, |
748 | 89.0k | 1, 1, 1, 1, |
749 | 89.0k | 0, 0, 0, 0, |
750 | 89.0k | 0, 0, 0, 0, |
751 | 89.0k | }, |
752 | | // patternSigCtx = 2 |
753 | 89.0k | { |
754 | 89.0k | 2, 1, 0, 0, |
755 | 89.0k | 2, 1, 0, 0, |
756 | 89.0k | 2, 1, 0, 0, |
757 | 89.0k | 2, 1, 0, 0, |
758 | 89.0k | }, |
759 | | // patternSigCtx = 3 |
760 | 89.0k | { |
761 | 89.0k | 2, 2, 2, 2, |
762 | 89.0k | 2, 2, 2, 2, |
763 | 89.0k | 2, 2, 2, 2, |
764 | 89.0k | 2, 2, 2, 2, |
765 | 89.0k | }, |
766 | | // 4x4 |
767 | 89.0k | { |
768 | 89.0k | 0, 1, 4, 5, |
769 | 89.0k | 2, 3, 4, 5, |
770 | 89.0k | 6, 6, 8, 8, |
771 | 89.0k | 7, 7, 8, 8 |
772 | 89.0k | } |
773 | 89.0k | }; |
774 | | |
775 | | /* iterate over coding groups in reverse scan order */ |
776 | 139k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) |
777 | 50.7k | { |
778 | 50.7k | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; |
779 | 50.7k | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; |
780 | 50.7k | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; |
781 | 50.7k | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); |
782 | 50.7k | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); |
783 | 50.7k | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
784 | 50.7k | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); |
785 | | |
786 | 50.7k | if (c1 == 0) |
787 | 0 | ctxSet++; |
788 | 50.7k | c1 = 1; |
789 | | |
790 | 50.7k | if (cgScanPos && (coeffNum[cgScanPos] == 0)) |
791 | 0 | { |
792 | | // TODO: do we need the zero-coeff cost? |
793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; |
795 | 0 | if (usePsyMask) |
796 | 0 | { |
797 | 0 | #if X265_ARCH_X86 |
798 | 0 | bool enable512 = detect512(); |
799 | 0 | if (enable512) |
800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
801 | 0 | else |
802 | 0 | { |
803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
805 | 0 | } |
806 | | #else |
807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
809 | | #endif |
810 | 0 | blkPos = codeParams.scan[scanPosBase]; |
811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
812 | 0 | { |
813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
814 | 0 | { |
815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; |
816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; |
817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); |
818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
819 | |
|
820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; |
822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
823 | 0 | } |
824 | 0 | blkPos += trSize; |
825 | 0 | } |
826 | 0 | } |
827 | 0 | else |
828 | 0 | { |
829 | | // non-psy path |
830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
831 | 0 | blkPos = codeParams.scan[scanPosBase]; |
832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
833 | 0 | { |
834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
835 | 0 | { |
836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; |
837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; |
838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); |
839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
840 | |
|
841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; |
843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
844 | 0 | } |
845 | 0 | blkPos += trSize; |
846 | 0 | } |
847 | 0 | } |
848 | | |
849 | | /* there were no coded coefficients in this coefficient group */ |
850 | 0 | { |
851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); |
853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ |
854 | 0 | } |
855 | 0 | continue; |
856 | 0 | } |
857 | | |
858 | 50.7k | coeffGroupRDStats cgRdStats; |
859 | 50.7k | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); |
860 | | |
861 | 50.7k | uint32_t subFlagMask = coeffFlag[cgScanPos]; |
862 | 50.7k | int c2 = 0; |
863 | 50.7k | uint32_t goRiceParam = 0; |
864 | 50.7k | uint32_t levelThreshold = 3; |
865 | 50.7k | uint32_t c1Idx = 0; |
866 | 50.7k | uint32_t c2Idx = 0; |
867 | | /* iterate over coefficients in each group in reverse scan order */ |
868 | 862k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) |
869 | 812k | { |
870 | 812k | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; |
871 | 812k | uint32_t blkPos = codeParams.scan[scanPos]; |
872 | 812k | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ |
873 | 812k | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ |
874 | 812k | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ |
875 | | |
876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level |
877 | | * and the original DCT coefficient. The result is shifted scaleBits to account for the |
878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ |
879 | | |
880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ |
881 | 812k | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; |
882 | 812k | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); |
883 | 812k | if (usePsyMask & scanPos) |
884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ |
885 | 309k | costUncoded[blkPos] -= PSYVALUE(predictedCoef); |
886 | | |
887 | 812k | totalUncodedCost += costUncoded[blkPos]; |
888 | | |
889 | | // coefficient level estimation |
890 | 812k | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; |
891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; |
892 | 812k | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; |
893 | 812k | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; |
894 | 812k | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; |
895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' |
896 | 812k | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
897 | | |
898 | | // before finding the last non-zero coeff |
899 | 812k | if (scanPos > (uint32_t)lastScanPos) |
900 | 697k | { |
901 | | /* coefficients after lastNZ have no distortion signal cost */ |
902 | 697k | costCoeff[scanPos] = 0; |
903 | 697k | costSig[scanPos] = 0; |
904 | | |
905 | | /* No non-zero coefficient yet found, but this does not mean |
906 | | * there is no uncoded-cost for this coefficient. Pre- |
907 | | * quantization the coefficient may have been non-zero */ |
908 | 697k | totalRdCost += costUncoded[blkPos]; |
909 | 697k | } |
910 | 114k | else if (!(subFlagMask & 1)) |
911 | 2.02k | { |
912 | | // fast zero coeff path |
913 | | /* set default costs to uncoded costs */ |
914 | 2.02k | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
915 | 2.02k | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; |
916 | 2.02k | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
917 | 2.02k | totalRdCost += costCoeff[scanPos]; |
918 | 2.02k | rateIncUp[blkPos] = greaterOneBits[0]; |
919 | | |
920 | 2.02k | subFlagMask >>= 1; |
921 | 2.02k | } |
922 | 112k | else |
923 | 112k | { |
924 | 112k | subFlagMask >>= 1; |
925 | | |
926 | 112k | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; |
927 | 112k | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} |
928 | | |
929 | 112k | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); |
930 | 112k | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); |
931 | 112k | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); |
932 | 112k | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); |
933 | | |
934 | | // coefficient level estimation |
935 | 112k | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; |
936 | 112k | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0); |
937 | | |
938 | 112k | uint32_t level = 0; |
939 | 112k | uint32_t sigCoefBits = 0; |
940 | 112k | costCoeff[scanPos] = MAX_INT64; |
941 | | |
942 | 112k | if ((int)scanPos == lastScanPos) |
943 | 50.7k | sigRateDelta[blkPos] = 0; |
944 | 62.1k | else |
945 | 62.1k | { |
946 | 62.1k | if (maxAbsLevel < 3) |
947 | 16.3k | { |
948 | | /* set default costs to uncoded costs */ |
949 | 16.3k | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
950 | 16.3k | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; |
951 | 16.3k | } |
952 | 62.1k | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
953 | 62.1k | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; |
954 | 62.1k | } |
955 | | |
956 | 112k | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); |
957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) |
958 | 112k | if (maxAbsLevel == 1) |
959 | 28.6k | { |
960 | 28.6k | uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; |
961 | 28.6k | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); |
962 | | |
963 | 28.6k | int unquantAbsLevel = unQuantLevel >> unquantShift; |
964 | 28.6k | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); |
965 | 28.6k | int d = abs(signCoef) - unquantAbsLevel; |
966 | 28.6k | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); |
967 | | |
968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ |
969 | 28.6k | if (usePsyMask & scanPos) |
970 | 12.9k | { |
971 | 12.9k | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); |
972 | 12.9k | curCost -= PSYVALUE(reconCoef); |
973 | 12.9k | } |
974 | | |
975 | 28.6k | if (curCost < costCoeff[scanPos]) |
976 | 27.6k | { |
977 | 27.6k | level = 1; |
978 | 27.6k | costCoeff[scanPos] = curCost; |
979 | 27.6k | costSig[scanPos] = SIGCOST(sigCoefBits); |
980 | 27.6k | } |
981 | 28.6k | } |
982 | 84.2k | else if (maxAbsLevel) |
983 | 84.3k | { |
984 | 84.3k | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; |
985 | 84.3k | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; |
986 | | |
987 | 84.3k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); |
988 | | |
989 | 84.3k | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; |
990 | 84.3k | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); |
991 | 84.3k | int d0 = abs(signCoef) - unquantAbsLevel0; |
992 | 84.3k | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); |
993 | | |
994 | 84.3k | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; |
995 | 84.3k | X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); |
996 | 84.3k | int d1 = abs(signCoef) - unquantAbsLevel1; |
997 | 84.3k | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); |
998 | | |
999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ |
1000 | 84.3k | if (usePsyMask & scanPos) |
1001 | 49.3k | { |
1002 | 49.3k | int reconCoef; |
1003 | 49.3k | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); |
1004 | 49.3k | curCost0 -= PSYVALUE(reconCoef); |
1005 | | |
1006 | 49.3k | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); |
1007 | 49.3k | curCost1 -= PSYVALUE(reconCoef); |
1008 | 49.3k | } |
1009 | 84.3k | if (curCost0 < costCoeff[scanPos]) |
1010 | 84.3k | { |
1011 | 84.3k | level = maxAbsLevel; |
1012 | 84.3k | costCoeff[scanPos] = curCost0; |
1013 | 84.3k | costSig[scanPos] = SIGCOST(sigCoefBits); |
1014 | 84.3k | } |
1015 | 84.3k | if (curCost1 < costCoeff[scanPos]) |
1016 | 2.30k | { |
1017 | 2.30k | level = maxAbsLevel - 1; |
1018 | 2.30k | costCoeff[scanPos] = curCost1; |
1019 | 2.30k | costSig[scanPos] = SIGCOST(sigCoefBits); |
1020 | 2.30k | } |
1021 | 84.3k | } |
1022 | | |
1023 | 112k | dstCoeff[blkPos] = (int16_t)level; |
1024 | 112k | totalRdCost += costCoeff[scanPos]; |
1025 | | |
1026 | | /* record costs for sign-hiding performed at the end */ |
1027 | 18.4E | if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level) |
1028 | 111k | { |
1029 | 111k | const int32_t diff0 = level - 1 - baseLevel; |
1030 | 111k | const int32_t diff2 = level + 1 - baseLevel; |
1031 | 111k | const int32_t maxVlc = g_goRiceRange[goRiceParam]; |
1032 | 111k | int rate0, rate1, rate2; |
1033 | | |
1034 | 111k | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% |
1035 | 27.0k | { |
1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} |
1037 | | // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 |
1038 | 27.0k | X265_CHECK(level == 1, "absLevel check failure\n"); |
1039 | | |
1040 | 27.0k | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; |
1041 | 27.0k | const int rateNotEqual2 = greaterOneBits[0]; |
1042 | | |
1043 | 27.0k | rate0 = 0; |
1044 | 27.0k | rate2 = rateEqual2; |
1045 | 27.0k | rate1 = rateNotEqual2; |
1046 | | |
1047 | 27.0k | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1048 | 27.0k | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1049 | 27.0k | X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1050 | 27.0k | } |
1051 | 84.9k | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% |
1052 | 34.1k | { |
1053 | | // NOTE: no c1c2 correct rate since all of rate include this factor |
1054 | 34.1k | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); |
1055 | 34.1k | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); |
1056 | 34.1k | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); |
1057 | 34.1k | } |
1058 | 50.8k | else |
1059 | 50.8k | { |
1060 | 50.8k | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1061 | 50.8k | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1062 | 50.8k | rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1063 | 50.8k | } |
1064 | 111k | rateIncUp[blkPos] = rate2 - rate1; |
1065 | 111k | rateIncDown[blkPos] = rate0 - rate1; |
1066 | 111k | } |
1067 | 956 | else |
1068 | 956 | { |
1069 | 956 | rateIncUp[blkPos] = greaterOneBits[0]; |
1070 | 956 | rateIncDown[blkPos] = 0; |
1071 | 956 | } |
1072 | | |
1073 | | /* Update CABAC estimation state */ |
1074 | 112k | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) |
1075 | 46.7k | { |
1076 | 46.7k | goRiceParam++; |
1077 | 46.7k | levelThreshold <<= 1; |
1078 | 46.7k | } |
1079 | | |
1080 | 112k | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; |
1081 | 112k | c1Idx += isNonZero; |
1082 | | |
1083 | | /* update bin model */ |
1084 | 112k | if (level > 1) |
1085 | 82.9k | { |
1086 | 82.9k | c1 = 0; |
1087 | 82.9k | c2 += (uint32_t)(c2 - 2) >> 31; |
1088 | 82.9k | c2Idx++; |
1089 | 82.9k | } |
1090 | 29.9k | else if (((c1 == 1) | (c1 == 2)) & isNonZero) |
1091 | 22.9k | c1++; |
1092 | | |
1093 | 112k | if (dstCoeff[blkPos]) |
1094 | 111k | { |
1095 | 111k | sigCoeffGroupFlag64 |= cgBlkPosMask; |
1096 | 111k | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; |
1097 | 111k | cgRdStats.uncodedDist += costUncoded[blkPos]; |
1098 | 111k | cgRdStats.nnzBeforePos0 += scanPosinCG; |
1099 | 111k | } |
1100 | 112k | } |
1101 | | |
1102 | 812k | cgRdStats.sigCost += costSig[scanPos]; |
1103 | 812k | } /* end for (scanPosinCG) */ |
1104 | | |
1105 | 50.7k | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); |
1106 | 50.7k | cgRdStats.sigCost0 = costSig[scanPos]; |
1107 | | |
1108 | 50.7k | costCoeffGroupSig[cgScanPos] = 0; |
1109 | | |
1110 | | /* nothing to do at this case */ |
1111 | 50.7k | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); |
1112 | | |
1113 | 50.7k | if (!cgScanPos || cgScanPos == cgLastScanPos) |
1114 | 50.7k | { |
1115 | | /* coeff group 0 is implied to be present, no signal cost */ |
1116 | | /* coeff group with last NZ is implied to be present, handled below */ |
1117 | 50.7k | } |
1118 | 18.4E | else if (sigCoeffGroupFlag64 & cgBlkPosMask) |
1119 | 0 | { |
1120 | 0 | if (!cgRdStats.nnzBeforePos0) |
1121 | 0 | { |
1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ |
1123 | 0 | totalRdCost -= cgRdStats.sigCost0; |
1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; |
1125 | 0 | } |
1126 | | |
1127 | | /* there are coded coefficients in this group, but now we include the signaling cost |
1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the |
1129 | | * coded group is more than the RD cost of the uncoded group */ |
1130 | |
|
1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
1132 | |
|
1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); |
1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ |
1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ |
1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ |
1137 | |
|
1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); |
1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ |
1140 | |
|
1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) |
1142 | 0 | { |
1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; |
1144 | 0 | totalRdCost = costZeroCG; |
1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); |
1146 | | |
1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ |
1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; |
1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1153 | 0 | } |
1154 | 0 | } |
1155 | 18.4E | else |
1156 | 18.4E | { |
1157 | | /* there were no coded coefficients in this coefficient group */ |
1158 | 18.4E | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
1159 | 18.4E | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); |
1160 | 18.4E | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ |
1161 | 18.4E | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ |
1162 | 18.4E | } |
1163 | 50.7k | } /* end for (cgScanPos) */ |
1164 | | |
1165 | 89.0k | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); |
1166 | | |
1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ |
1168 | 89.0k | int64_t bestCost; |
1169 | 89.0k | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) |
1170 | 0 | { |
1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); |
1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); |
1173 | 0 | } |
1174 | 89.0k | else |
1175 | 89.0k | { |
1176 | 89.0k | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; |
1177 | 89.0k | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); |
1178 | 89.0k | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); |
1179 | 89.0k | } |
1180 | | |
1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last |
1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs |
1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out |
1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty |
1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ |
1186 | 89.0k | int bestLastIdx = 0; |
1187 | 89.0k | bool foundLast = false; |
1188 | 139k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) |
1189 | 50.7k | { |
1190 | 50.7k | if (!cgScanPos || cgScanPos == cgLastScanPos) |
1191 | 50.7k | { |
1192 | | /* the presence of these coefficient groups is inferred; they have no bit in |
1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ |
1194 | 50.7k | } |
1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) |
1196 | 0 | { |
1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred |
1198 | | * from lastNZ if it were present in this group */ |
1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; |
1200 | 0 | } |
1201 | 0 | else |
1202 | 0 | { |
1203 | | /* remove cost of signaling this empty group as not present */ |
1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; |
1205 | 0 | continue; |
1206 | 0 | } |
1207 | | |
1208 | 777k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) |
1209 | 761k | { |
1210 | 761k | scanPos = cgScanPos * cgSize + scanPosinCG; |
1211 | 761k | if ((int)scanPos > lastScanPos) |
1212 | 697k | continue; |
1213 | | |
1214 | | /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then |
1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the |
1216 | | * cost of signaling it as not-significant */ |
1217 | 63.9k | uint32_t blkPos = codeParams.scan[scanPos]; |
1218 | 63.9k | if (dstCoeff[blkPos]) |
1219 | 61.9k | { |
1220 | | // Calculates the cost of signaling the last significant coefficient in the block |
1221 | 61.9k | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; |
1222 | 61.9k | if (codeParams.scanType == SCAN_VER) |
1223 | 4.46k | std::swap(pos[0], pos[1]); |
1224 | 61.9k | uint32_t bitsLastNZ = 0; |
1225 | | |
1226 | 185k | for (int i = 0; i < 2; i++) |
1227 | 123k | { |
1228 | 123k | int temp = g_lastCoeffTable[pos[i]]; |
1229 | 123k | int prefixOnes = temp & 15; |
1230 | 123k | int suffixLen = temp >> 4; |
1231 | | |
1232 | 123k | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; |
1233 | 123k | bitsLastNZ += IEP_RATE * suffixLen; |
1234 | 123k | } |
1235 | | |
1236 | 61.9k | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); |
1237 | | |
1238 | 61.9k | if (costAsLast < bestCost) |
1239 | 41.0k | { |
1240 | 41.0k | bestLastIdx = scanPos + 1; |
1241 | 41.0k | bestCost = costAsLast; |
1242 | 41.0k | } |
1243 | 61.9k | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) |
1244 | 34.9k | { |
1245 | 34.9k | foundLast = true; |
1246 | 34.9k | break; |
1247 | 34.9k | } |
1248 | | |
1249 | 27.0k | totalRdCost -= costCoeff[scanPos]; |
1250 | 27.0k | totalRdCost += costUncoded[blkPos]; |
1251 | 27.0k | } |
1252 | 2.01k | else |
1253 | 2.01k | totalRdCost -= costSig[scanPos]; |
1254 | 63.9k | } |
1255 | 50.7k | } |
1256 | | |
1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ |
1258 | 89.0k | numSig = 0; |
1259 | 189k | for (int pos = 0; pos < bestLastIdx; pos++) |
1260 | 100k | { |
1261 | 100k | int blkPos = codeParams.scan[pos]; |
1262 | 100k | int level = dstCoeff[blkPos]; |
1263 | 100k | numSig += (level != 0); |
1264 | | |
1265 | 100k | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; |
1266 | 100k | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); |
1267 | 100k | } |
1268 | | |
1269 | | // Average 49.62 pixels |
1270 | | /* clean uncoded coefficients */ |
1271 | 89.0k | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); |
1272 | 800k | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) |
1273 | 711k | { |
1274 | 711k | dstCoeff[codeParams.scan[pos]] = 0; |
1275 | 711k | } |
1276 | 89.0k | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) |
1277 | 0 | { |
1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; |
1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1283 | 0 | } |
1284 | | |
1285 | | /* rate-distortion based sign-hiding */ |
1286 | 89.0k | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) |
1287 | 4.99k | { |
1288 | 4.99k | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; |
1289 | 4.99k | int lastCG = 1; |
1290 | | |
1291 | 9.98k | for (int subSet = realLastScanPos; subSet >= 0; subSet--) |
1292 | 4.99k | { |
1293 | 4.99k | int subPos = subSet << LOG2_SCAN_SET_SIZE; |
1294 | 4.99k | int n; |
1295 | | |
1296 | 4.99k | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) |
1297 | 0 | continue; |
1298 | | |
1299 | | /* measure distance between first and last non-zero coef in this |
1300 | | * coding group */ |
1301 | 4.99k | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]); |
1302 | 4.99k | const int firstNZPosInCG = (uint8_t)posFirstLast; |
1303 | 4.99k | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); |
1304 | 4.99k | const uint32_t absSumSign = posFirstLast; |
1305 | | |
1306 | 4.99k | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) |
1307 | 4.86k | { |
1308 | 4.86k | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); |
1309 | | |
1310 | | #if CHECKED_BUILD || _DEBUG |
1311 | | int32_t absSum_dummy = 0; |
1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) |
1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; |
1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); |
1315 | | #endif |
1316 | | |
1317 | | //if (signbit != absSumSign) |
1318 | 4.86k | if (((int32_t)(signbit ^ absSumSign)) < 0) |
1319 | 2.27k | { |
1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff |
1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and |
1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ |
1323 | | |
1324 | 2.27k | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; |
1325 | 2.27k | uint32_t minPos = 0; |
1326 | 2.27k | int8_t finalChange = 0; |
1327 | 2.27k | int curChange = 0; |
1328 | 2.27k | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; |
1329 | | |
1330 | 32.0k | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) |
1331 | 29.7k | { |
1332 | 29.7k | const uint32_t blkPos = codeParams.scan[n + subPos]; |
1333 | 29.7k | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ |
1334 | 29.7k | const int absLevel = abs(dstCoeff[blkPos]); |
1335 | | // TODO: this is constant in non-scaling mode |
1336 | 29.7k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); |
1337 | 29.7k | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); |
1338 | | |
1339 | 29.7k | int d = abs(signCoef) - (unQuantLevel >> unquantShift); |
1340 | 29.7k | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); |
1341 | | |
1342 | 29.7k | const int64_t origDist = (((int64_t)d * d)); |
1343 | | |
1344 | 58.4k | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) |
1345 | | |
1346 | 29.7k | const uint32_t isOne = (absLevel == 1); |
1347 | 29.7k | if (dstCoeff[blkPos]) |
1348 | 28.6k | { |
1349 | 28.6k | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); |
1350 | 28.6k | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); |
1351 | 28.6k | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); |
1352 | | |
1353 | | /* if decrementing would make the coeff 0, we can include the |
1354 | | * significant coeff flag cost savings */ |
1355 | 28.6k | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); |
1356 | 28.6k | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); |
1357 | 28.6k | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); |
1358 | 28.6k | int64_t costDown = DELTARDCOST(origDist, d, downBits); |
1359 | | |
1360 | 28.6k | costDown -= lastCoeffAdjust; |
1361 | 28.6k | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; |
1362 | | |
1363 | 28.6k | curChange = 2 * (costUp < costDown) - 1; |
1364 | 28.6k | curCost = (costUp < costDown) ? costUp : curCost; |
1365 | 28.6k | } |
1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) |
1367 | 1.06k | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) |
1368 | 0 | { |
1369 | | /* don't try to make a new coded coeff before the first coeff if its |
1370 | | * sign would be different than the first coeff, the inferred sign would |
1371 | | * still be wrong and we'd have to do this again. */ |
1372 | 0 | curCost = MAX_INT64; |
1373 | 0 | } |
1374 | 1.06k | else |
1375 | 1.06k | { |
1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ |
1377 | 1.06k | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); |
1378 | 1.06k | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); |
1379 | 1.06k | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); |
1380 | 1.06k | curChange = 1; |
1381 | 1.06k | } |
1382 | | |
1383 | 29.7k | if (curCost < minCostInc) |
1384 | 7.87k | { |
1385 | 7.87k | minCostInc = curCost; |
1386 | 7.87k | finalChange = (int8_t)curChange; |
1387 | 7.87k | minPos = blkPos + (absLevel << 16); |
1388 | 7.87k | } |
1389 | 29.7k | lastCoeffAdjust = 0; |
1390 | 29.7k | } |
1391 | | |
1392 | 2.27k | const int absInMinPos = (minPos >> 16); |
1393 | 2.27k | minPos = (uint16_t)minPos; |
1394 | | |
1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) |
1396 | 2.27k | if (absInMinPos >= 32767) |
1397 | | /* don't allow sign hiding to violate the SPEC range */ |
1398 | 0 | finalChange = -1; |
1399 | | |
1400 | | // NOTE: Reference code |
1401 | | //if (dstCoeff[minPos] == 0) |
1402 | | // numSig++; |
1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) |
1404 | | // numSig--; |
1405 | 2.27k | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); |
1406 | | |
1407 | | |
1408 | | // NOTE: Reference code |
1409 | | //if (m_resiDctCoeff[minPos] >= 0) |
1410 | | // dstCoeff[minPos] += finalChange; |
1411 | | //else |
1412 | | // dstCoeff[minPos] -= finalChange; |
1413 | 2.27k | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); |
1414 | 2.27k | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); |
1415 | 2.27k | } |
1416 | 4.86k | } |
1417 | | |
1418 | 4.99k | lastCG = 0; |
1419 | 4.99k | } |
1420 | 4.99k | } |
1421 | | |
1422 | 89.0k | return numSig; |
1423 | 9.02M | } unsigned int x265::Quant::rdoQuant<2u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Line | Count | Source | 611 | 7.45M | { | 612 | 7.45M | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | 613 | 7.45M | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | 614 | 7.45M | const uint32_t usePsyMask = usePsy ? -1 : 0; | 615 | | | 616 | 7.45M | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | 617 | | | 618 | 7.45M | int rem = m_qpParam[ttype].rem; | 619 | 7.45M | int per = m_qpParam[ttype].per; | 620 | 7.45M | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | 621 | 7.45M | int add = (1 << (qbits - 1)); | 622 | 7.45M | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | 623 | | | 624 | 7.45M | const int numCoeff = 1 << (log2TrSize * 2); | 625 | 7.45M | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | 626 | 7.45M | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); | 627 | 7.45M | if (!numSig) | 628 | 7.40M | return 0; | 629 | 55.1k | const uint32_t trSize = 1 << log2TrSize; | 630 | 55.1k | int64_t lambda2 = m_qpParam[ttype].lambda2; | 631 | 55.1k | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); | 632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | 633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | 634 | | * at several stages. 
We skip the clipping for simplicity when measuring RD cost */ | 635 | 55.1k | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | 636 | 55.1k | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); | 637 | 55.1k | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | 638 | 55.1k | const int scaleBits = SCALE_BITS - 2 * transformShift; | 639 | | | 640 | 55.1k | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | 641 | 55.1k | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | 642 | 55.1k | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | 643 | 55.1k | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) | 644 | | | 645 | 55.1k | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ | 646 | 55.1k | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ | 647 | 55.1k | int64_t costSig[trSize * trSize]; /* lambda * bits */ | 648 | | | 649 | 55.1k | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ | 650 | 55.1k | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ | 651 | 55.1k | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ | 652 | | | 653 | 55.1k | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | 654 | 55.1k | uint64_t sigCoeffGroupFlag64 = 0; | 655 | | | 656 | 55.1k | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | 657 | 55.1k | bool bIsLuma = ttype == TEXT_LUMA; | 658 | | | 659 | | /* total rate distortion cost of transform block, as CBF=0 */ | 660 | 55.1k | int64_t totalUncodedCost = 0; | 661 | | | 662 | | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, | 663 | | * the distortion and signal cost of coded blocks, and the coding cost of 
significant | 664 | | * coefficient and coefficient group bitmaps */ | 665 | 55.1k | int64_t totalRdCost = 0; | 666 | | | 667 | 55.1k | TUEntropyCodingParameters codeParams; | 668 | 55.1k | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | 669 | 55.1k | const uint32_t log2TrSizeCG = log2TrSize - 2; | 670 | 55.1k | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); | 671 | 55.1k | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); | 672 | | | 673 | 55.1k | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] | 674 | 55.1k | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign | 675 | 55.1k | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff | 676 | | | 677 | | #if CHECKED_BUILD || _DEBUG | 678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group | 679 | | memset(coeffNum, 0, sizeof(coeffNum)); | 680 | | memset(coeffSign, 0, sizeof(coeffNum)); | 681 | | memset(coeffFlag, 0, sizeof(coeffNum)); | 682 | | #endif | 683 | 55.1k | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); | 684 | 55.1k | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); | 685 | | | 686 | | | 687 | | /* TODO: update bit estimates if dirty */ | 688 | 55.1k | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | 689 | | | 690 | 55.1k | uint32_t scanPos = 0; | 691 | 55.1k | uint32_t c1 = 1; | 692 | | | 693 | | // process trail all zero Coeff Group | 694 | | | 695 | | /* coefficients after lastNZ have no distortion signal cost */ | 696 | 55.1k | const int zeroCG = cgNum - 1 - cgLastScanPos; | 697 | 55.1k | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 698 | 55.1k | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 699 | | | 700 | | /* sum 
zero coeff (uncodec) cost */ | 701 | | | 702 | | // TODO: does we need these cost? | 703 | 55.1k | if (usePsyMask) | 704 | 6.44k | { | 705 | 6.44k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 706 | 0 | { | 707 | 0 | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 708 | 0 | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 709 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 710 | 0 | #if X265_ARCH_X86 | 711 | 0 | bool enable512 = detect512(); | 712 | 0 | if (enable512) | 713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 714 | 0 | else | 715 | 0 | { | 716 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); | 717 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 718 | 0 | } | 719 | | #else | 720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 722 | | #endif | 723 | 0 | } | 724 | 6.44k | } | 725 | 48.7k | else | 726 | 48.7k | { | 727 | | // non-psy path | 728 | 48.7k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 729 | 0 | { | 730 | 0 | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 731 | 0 | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 732 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 733 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 734 | 0 | } | 735 | 48.7k | } | 736 | 55.1k | static const uint8_t table_cnt[5][SCAN_SET_SIZE] = | 737 | 55.1k | { | 738 | | // 
patternSigCtx = 0 | 739 | 55.1k | { | 740 | 55.1k | 2, 1, 1, 0, | 741 | 55.1k | 1, 1, 0, 0, | 742 | 55.1k | 1, 0, 0, 0, | 743 | 55.1k | 0, 0, 0, 0, | 744 | 55.1k | }, | 745 | | // patternSigCtx = 1 | 746 | 55.1k | { | 747 | 55.1k | 2, 2, 2, 2, | 748 | 55.1k | 1, 1, 1, 1, | 749 | 55.1k | 0, 0, 0, 0, | 750 | 55.1k | 0, 0, 0, 0, | 751 | 55.1k | }, | 752 | | // patternSigCtx = 2 | 753 | 55.1k | { | 754 | 55.1k | 2, 1, 0, 0, | 755 | 55.1k | 2, 1, 0, 0, | 756 | 55.1k | 2, 1, 0, 0, | 757 | 55.1k | 2, 1, 0, 0, | 758 | 55.1k | }, | 759 | | // patternSigCtx = 3 | 760 | 55.1k | { | 761 | 55.1k | 2, 2, 2, 2, | 762 | 55.1k | 2, 2, 2, 2, | 763 | 55.1k | 2, 2, 2, 2, | 764 | 55.1k | 2, 2, 2, 2, | 765 | 55.1k | }, | 766 | | // 4x4 | 767 | 55.1k | { | 768 | 55.1k | 0, 1, 4, 5, | 769 | 55.1k | 2, 3, 4, 5, | 770 | 55.1k | 6, 6, 8, 8, | 771 | 55.1k | 7, 7, 8, 8 | 772 | 55.1k | } | 773 | 55.1k | }; | 774 | | | 775 | | /* iterate over coding groups in reverse scan order */ | 776 | 72.5k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) | 777 | 17.4k | { | 778 | 17.4k | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; | 779 | 17.4k | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | 780 | 17.4k | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; | 781 | 17.4k | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); | 782 | 17.4k | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | 783 | 17.4k | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 784 | 17.4k | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); | 785 | | | 786 | 17.4k | if (c1 == 0) | 787 | 0 | ctxSet++; | 788 | 17.4k | c1 = 1; | 789 | | | 790 | 17.4k | if (cgScanPos && (coeffNum[cgScanPos] == 0)) | 791 | 0 | { | 792 | | // TODO: does we need zero-coeff cost? 
| 793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 795 | 0 | if (usePsyMask) | 796 | 0 | { | 797 | 0 | #if X265_ARCH_X86 | 798 | 0 | bool enable512 = detect512(); | 799 | 0 | if (enable512) | 800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 801 | 0 | else | 802 | 0 | { | 803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 805 | 0 | } | 806 | | #else | 807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 809 | | #endif | 810 | 0 | blkPos = codeParams.scan[scanPosBase]; | 811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 812 | 0 | { | 813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 814 | 0 | { | 815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 819 | |
| 820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 823 | 0 | } | 824 | 0 | blkPos += trSize; | 825 | 0 | } | 826 | 0 | } | 827 | 0 | else | 828 | 0 | { | 829 | | // non-psy path | 830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 831 | 0 | blkPos = codeParams.scan[scanPosBase]; | 832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 833 | 0 | { | 834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 835 | 0 | { | 836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 840 | |
| 841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 844 | 0 | } | 845 | 0 | blkPos += trSize; | 846 | 0 | } | 847 | 0 | } | 848 | | | 849 | | /* there were no coded coefficients in this coefficient group */ | 850 | 0 | { | 851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 854 | 0 | } | 855 | 0 | continue; | 856 | 0 | } | 857 | | | 858 | 17.4k | coeffGroupRDStats cgRdStats; | 859 | 17.4k | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | 860 | | | 861 | 17.4k | uint32_t subFlagMask = coeffFlag[cgScanPos]; | 862 | 17.4k | int c2 = 0; | 863 | 17.4k | uint32_t goRiceParam = 0; | 864 | 17.4k | uint32_t levelThreshold = 3; | 865 | 17.4k | uint32_t c1Idx = 0; | 866 | 17.4k | uint32_t c2Idx = 0; | 867 | | /* iterate over coefficients in each group in reverse scan order */ | 868 | 296k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 869 | 278k | { | 870 | 278k | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | 871 | 278k | uint32_t blkPos = codeParams.scan[scanPos]; | 872 | 278k | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ | 873 | 278k | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 874 | 278k | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | 875 | | | 876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level | 877 | | * and the original DCT coefficient. 
The result is shifted scaleBits to account for the | 878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | 879 | | | 880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ | 881 | 278k | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; | 882 | 278k | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); | 883 | 278k | if (usePsyMask & scanPos) | 884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ | 885 | 96.6k | costUncoded[blkPos] -= PSYVALUE(predictedCoef); | 886 | | | 887 | 278k | totalUncodedCost += costUncoded[blkPos]; | 888 | | | 889 | | // coefficient level estimation | 890 | 278k | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; | 891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; | 892 | 278k | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; | 893 | 278k | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; | 894 | 278k | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; | 895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 
4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' | 896 | 278k | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 897 | | | 898 | | // before find lastest non-zero coeff | 899 | 278k | if (scanPos > (uint32_t)lastScanPos) | 900 | 197k | { | 901 | | /* coefficients after lastNZ have no distortion signal cost */ | 902 | 197k | costCoeff[scanPos] = 0; | 903 | 197k | costSig[scanPos] = 0; | 904 | | | 905 | | /* No non-zero coefficient yet found, but this does not mean | 906 | | * there is no uncoded-cost for this coefficient. Pre- | 907 | | * quantization the coefficient may have been non-zero */ | 908 | 197k | totalRdCost += costUncoded[blkPos]; | 909 | 197k | } | 910 | 81.6k | else if (!(subFlagMask & 1)) | 911 | 2.02k | { | 912 | | // fast zero coeff path | 913 | | /* set default costs to uncoded costs */ | 914 | 2.02k | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 915 | 2.02k | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 916 | 2.02k | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 917 | 2.02k | totalRdCost += costCoeff[scanPos]; | 918 | 2.02k | rateIncUp[blkPos] = greaterOneBits[0]; | 919 | | | 920 | 2.02k | subFlagMask >>= 1; | 921 | 2.02k | } | 922 | 79.6k | else | 923 | 79.6k | { | 924 | 79.6k | subFlagMask >>= 1; | 925 | | | 926 | 79.6k | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | 927 | 79.6k | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} | 928 | | | 929 | 79.6k | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | 930 | 79.6k | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 
2\n"); | 931 | 79.6k | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | 932 | 79.6k | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); | 933 | | | 934 | | // coefficient level estimation | 935 | 79.6k | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; | 936 | 79.6k | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0); | 937 | | | 938 | 79.6k | uint32_t level = 0; | 939 | 79.6k | uint32_t sigCoefBits = 0; | 940 | 79.6k | costCoeff[scanPos] = MAX_INT64; | 941 | | | 942 | 79.6k | if ((int)scanPos == lastScanPos) | 943 | 17.4k | sigRateDelta[blkPos] = 0; | 944 | 62.1k | else | 945 | 62.1k | { | 946 | 62.1k | if (maxAbsLevel < 3) | 947 | 16.3k | { | 948 | | /* set default costs to uncoded costs */ | 949 | 16.3k | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 950 | 16.3k | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 951 | 16.3k | } | 952 | 62.1k | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 953 | 62.1k | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; | 954 | 62.1k | } | 955 | | | 956 | 79.6k | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); | 957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) | 958 | 79.6k | if (maxAbsLevel == 1) | 959 | 15.3k | { | 960 | 15.3k | uint32_t levelBits = (c1c2idx & 1) ? 
greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; | 961 | 15.3k | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); | 962 | | | 963 | 15.3k | int unquantAbsLevel = unQuantLevel >> unquantShift; | 964 | 15.3k | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); | 965 | 15.3k | int d = abs(signCoef) - unquantAbsLevel; | 966 | 15.3k | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | 967 | | | 968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 969 | 15.3k | if (usePsyMask & scanPos) | 970 | 12.9k | { | 971 | 12.9k | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | 972 | 12.9k | curCost -= PSYVALUE(reconCoef); | 973 | 12.9k | } | 974 | | | 975 | 15.3k | if (curCost < costCoeff[scanPos]) | 976 | 14.3k | { | 977 | 14.3k | level = 1; | 978 | 14.3k | costCoeff[scanPos] = curCost; | 979 | 14.3k | costSig[scanPos] = SIGCOST(sigCoefBits); | 980 | 14.3k | } | 981 | 15.3k | } | 982 | 64.3k | else if (maxAbsLevel) | 983 | 64.3k | { | 984 | 64.3k | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 985 | 64.3k | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 986 | | | 987 | 64.3k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 988 | | | 989 | 64.3k | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; | 990 | 64.3k | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); | 991 | 64.3k | int d0 = abs(signCoef) - unquantAbsLevel0; | 992 | 64.3k | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); | 993 | | | 994 | 64.3k | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; | 995 | 64.3k | 
X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); | 996 | 64.3k | int d1 = abs(signCoef) - unquantAbsLevel1; | 997 | 64.3k | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); | 998 | | | 999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 1000 | 64.3k | if (usePsyMask & scanPos) | 1001 | 49.3k | { | 1002 | 49.3k | int reconCoef; | 1003 | 49.3k | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); | 1004 | 49.3k | curCost0 -= PSYVALUE(reconCoef); | 1005 | | | 1006 | 49.3k | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); | 1007 | 49.3k | curCost1 -= PSYVALUE(reconCoef); | 1008 | 49.3k | } | 1009 | 64.3k | if (curCost0 < costCoeff[scanPos]) | 1010 | 64.3k | { | 1011 | 64.3k | level = maxAbsLevel; | 1012 | 64.3k | costCoeff[scanPos] = curCost0; | 1013 | 64.3k | costSig[scanPos] = SIGCOST(sigCoefBits); | 1014 | 64.3k | } | 1015 | 64.3k | if (curCost1 < costCoeff[scanPos]) | 1016 | 2.16k | { | 1017 | 2.16k | level = maxAbsLevel - 1; | 1018 | 2.16k | costCoeff[scanPos] = curCost1; | 1019 | 2.16k | costSig[scanPos] = SIGCOST(sigCoefBits); | 1020 | 2.16k | } | 1021 | 64.3k | } | 1022 | | | 1023 | 79.6k | dstCoeff[blkPos] = (int16_t)level; | 1024 | 79.6k | totalRdCost += costCoeff[scanPos]; | 1025 | | | 1026 | | /* record costs for sign-hiding performed at the end */ | 1027 | 79.6k | if ((cu.m_slice->m_pps->bSignHideEnabled ? 
~0 : 0) & level) | 1028 | 78.6k | { | 1029 | 78.6k | const int32_t diff0 = level - 1 - baseLevel; | 1030 | 78.6k | const int32_t diff2 = level + 1 - baseLevel; | 1031 | 78.6k | const int32_t maxVlc = g_goRiceRange[goRiceParam]; | 1032 | 78.6k | int rate0, rate1, rate2; | 1033 | | | 1034 | 78.6k | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% | 1035 | 13.6k | { | 1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} | 1037 | | // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 | 1038 | 13.6k | X265_CHECK(level == 1, "absLevel check failure\n"); | 1039 | | | 1040 | 13.6k | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; | 1041 | 13.6k | const int rateNotEqual2 = greaterOneBits[0]; | 1042 | | | 1043 | 13.6k | rate0 = 0; | 1044 | 13.6k | rate2 = rateEqual2; | 1045 | 13.6k | rate1 = rateNotEqual2; | 1046 | | | 1047 | 13.6k | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1048 | 13.6k | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1049 | 13.6k | X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1050 | 13.6k | } | 1051 | 64.9k | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% | 1052 | 34.1k | { | 1053 | | // NOTE: no c1c2 correct rate since all of rate include this factor | 1054 | 34.1k | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); | 1055 | 34.1k | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); | 1056 | 34.1k | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); | 1057 | 34.1k | } | 1058 | 30.8k | else | 1059 | 30.8k | { | 1060 | 30.8k | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1061 | 30.8k | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1062 | 30.8k | rate0 = 
getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1063 | 30.8k | } | 1064 | 78.6k | rateIncUp[blkPos] = rate2 - rate1; | 1065 | 78.6k | rateIncDown[blkPos] = rate0 - rate1; | 1066 | 78.6k | } | 1067 | 968 | else | 1068 | 968 | { | 1069 | 968 | rateIncUp[blkPos] = greaterOneBits[0]; | 1070 | 968 | rateIncDown[blkPos] = 0; | 1071 | 968 | } | 1072 | | | 1073 | | /* Update CABAC estimation state */ | 1074 | 79.6k | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) | 1075 | 26.7k | { | 1076 | 26.7k | goRiceParam++; | 1077 | 26.7k | levelThreshold <<= 1; | 1078 | 26.7k | } | 1079 | | | 1080 | 79.6k | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; | 1081 | 79.6k | c1Idx += isNonZero; | 1082 | | | 1083 | | /* update bin model */ | 1084 | 79.6k | if (level > 1) | 1085 | 62.9k | { | 1086 | 62.9k | c1 = 0; | 1087 | 62.9k | c2 += (uint32_t)(c2 - 2) >> 31; | 1088 | 62.9k | c2Idx++; | 1089 | 62.9k | } | 1090 | 16.6k | else if (((c1 == 1) | (c1 == 2)) & isNonZero) | 1091 | 9.61k | c1++; | 1092 | | | 1093 | 79.6k | if (dstCoeff[blkPos]) | 1094 | 78.6k | { | 1095 | 78.6k | sigCoeffGroupFlag64 |= cgBlkPosMask; | 1096 | 78.6k | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | 1097 | 78.6k | cgRdStats.uncodedDist += costUncoded[blkPos]; | 1098 | 78.6k | cgRdStats.nnzBeforePos0 += scanPosinCG; | 1099 | 78.6k | } | 1100 | 79.6k | } | 1101 | | | 1102 | 278k | cgRdStats.sigCost += costSig[scanPos]; | 1103 | 278k | } /* end for (scanPosinCG) */ | 1104 | | | 1105 | 17.4k | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); | 1106 | 17.4k | cgRdStats.sigCost0 = costSig[scanPos]; | 1107 | | | 1108 | 17.4k | costCoeffGroupSig[cgScanPos] = 0; | 1109 | | | 1110 | | /* nothing to do at this case */ | 1111 | 17.4k | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); | 1112 | | | 1113 | 17.4k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1114 | 
17.4k | { | 1115 | | /* coeff group 0 is implied to be present, no signal cost */ | 1116 | | /* coeff group with last NZ is implied to be present, handled below */ | 1117 | 17.4k | } | 1118 | 0 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | 1119 | 0 | { | 1120 | 0 | if (!cgRdStats.nnzBeforePos0) | 1121 | 0 | { | 1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | 1123 | 0 | totalRdCost -= cgRdStats.sigCost0; | 1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; | 1125 | 0 | } | 1126 | | | 1127 | | /* there are coded coefficients in this group, but now we include the signaling cost | 1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the | 1129 | | * coded group is more than the RD cost of the uncoded group */ | 1130 | |
| 1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1132 | |
| 1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | 1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | 1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | 1137 | |
| 1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | 1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | 1140 | |
| 1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) | 1142 | 0 | { | 1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | 1144 | 0 | totalRdCost = costZeroCG; | 1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1146 | | | 1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | 1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; | 1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1153 | 0 | } | 1154 | 0 | } | 1155 | 0 | else | 1156 | 0 | { | 1157 | | /* there were no coded coefficients in this coefficient group */ | 1158 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1159 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 1160 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 1161 | 0 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | 1162 | 0 | } | 1163 | 17.4k | } /* end for (cgScanPos) */ | 1164 | | | 1165 | 55.1k | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | 1166 | | | 1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | 1168 | 55.1k | int64_t bestCost; | 1169 | 55.1k | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | 1170 | 0 | { | 1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | 1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | 1173 | 0 | } | 1174 | 55.1k | else | 1175 | 55.1k | { | 1176 | 55.1k | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | 1177 | 55.1k | bestCost = 
totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | 1178 | 55.1k | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | 1179 | 55.1k | } | 1180 | | | 1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last | 1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | 1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | 1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | 1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | 1186 | 55.1k | int bestLastIdx = 0; | 1187 | 55.1k | bool foundLast = false; | 1188 | 72.5k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | 1189 | 17.4k | { | 1190 | 17.4k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1191 | 17.4k | { | 1192 | | /* the presence of these coefficient groups are inferred, they have no bit in | 1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | 1194 | 17.4k | } | 1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | 1196 | 0 | { | 1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred | 1198 | | * from lastNZ if it were present in this group */ | 1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1200 | 0 | } | 1201 | 0 | else | 1202 | 0 | { | 1203 | | /* remove cost of signaling this empty group as not present */ | 1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1205 | 0 | continue; | 1206 | 0 | } | 1207 | | | 1208 | 230k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 1209 | 227k | { | 1210 | 227k | scanPos = cgScanPos * cgSize + scanPosinCG; | 1211 | 227k | if ((int)scanPos > lastScanPos) | 1212 | 197k | continue; | 1213 | | | 1214 | | /* if the coefficient was coded, measure the RD cost of it as 
the last non-zero and then | 1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | 1216 | | * cost of signaling it as not-significant */ | 1217 | 30.6k | uint32_t blkPos = codeParams.scan[scanPos]; | 1218 | 30.6k | if (dstCoeff[blkPos]) | 1219 | 28.6k | { | 1220 | | // Calculates the cost of signaling the last significant coefficient in the block | 1221 | 28.6k | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; | 1222 | 28.6k | if (codeParams.scanType == SCAN_VER) | 1223 | 4.09k | std::swap(pos[0], pos[1]); | 1224 | 28.6k | uint32_t bitsLastNZ = 0; | 1225 | | | 1226 | 85.8k | for (int i = 0; i < 2; i++) | 1227 | 57.2k | { | 1228 | 57.2k | int temp = g_lastCoeffTable[pos[i]]; | 1229 | 57.2k | int prefixOnes = temp & 15; | 1230 | 57.2k | int suffixLen = temp >> 4; | 1231 | | | 1232 | 57.2k | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; | 1233 | 57.2k | bitsLastNZ += IEP_RATE * suffixLen; | 1234 | 57.2k | } | 1235 | | | 1236 | 28.6k | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | 1237 | | | 1238 | 28.6k | if (costAsLast < bestCost) | 1239 | 18.1k | { | 1240 | 18.1k | bestLastIdx = scanPos + 1; | 1241 | 18.1k | bestCost = costAsLast; | 1242 | 18.1k | } | 1243 | 28.6k | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) | 1244 | 14.9k | { | 1245 | 14.9k | foundLast = true; | 1246 | 14.9k | break; | 1247 | 14.9k | } | 1248 | | | 1249 | 13.6k | totalRdCost -= costCoeff[scanPos]; | 1250 | 13.6k | totalRdCost += costUncoded[blkPos]; | 1251 | 13.6k | } | 1252 | 2.01k | else | 1253 | 2.01k | totalRdCost -= costSig[scanPos]; | 1254 | 30.6k | } | 1255 | 17.4k | } | 1256 | | | 1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ | 1258 | 55.1k | numSig = 0; | 1259 | 132k | for (int pos = 0; pos < bestLastIdx; pos++) | 1260 | 77.5k | { | 1261 | 77.5k | int blkPos = codeParams.scan[pos]; | 1262 | 77.5k | int level = dstCoeff[blkPos]; | 1263 | 
77.5k | numSig += (level != 0); | 1264 | | | 1265 | 77.5k | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | 1266 | 77.5k | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | 1267 | 77.5k | } | 1268 | | | 1269 | | // Average 49.62 pixels | 1270 | | /* clean uncoded coefficients */ | 1271 | 55.1k | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); | 1272 | 256k | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) | 1273 | 201k | { | 1274 | 201k | dstCoeff[codeParams.scan[pos]] = 0; | 1275 | 201k | } | 1276 | 55.1k | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) | 1277 | 0 | { | 1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; | 1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1283 | 0 | } | 1284 | | | 1285 | | /* rate-distortion based sign-hiding */ | 1286 | 55.1k | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | 1287 | 4.99k | { | 1288 | 4.99k | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; | 1289 | 4.99k | int lastCG = 1; | 1290 | | | 1291 | 9.98k | for (int subSet = realLastScanPos; subSet >= 0; subSet--) | 1292 | 4.99k | { | 1293 | 4.99k | int subPos = subSet << LOG2_SCAN_SET_SIZE; | 1294 | 4.99k | int n; | 1295 | | | 1296 | 4.99k | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) | 1297 | 0 | continue; | 1298 | | | 1299 | | /* measure distance between first and last non-zero coef in this | 1300 | | * coding group */ | 1301 | 4.99k | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, 
g_scan4x4[codeParams.scanType]); | 1302 | 4.99k | const int firstNZPosInCG = (uint8_t)posFirstLast; | 1303 | 4.99k | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); | 1304 | 4.99k | const uint32_t absSumSign = posFirstLast; | 1305 | | | 1306 | 4.99k | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | 1307 | 4.86k | { | 1308 | 4.86k | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); | 1309 | | | 1310 | | #if CHECKED_BUILD || _DEBUG | 1311 | | int32_t absSum_dummy = 0; | 1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | 1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; | 1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); | 1315 | | #endif | 1316 | | | 1317 | | //if (signbit != absSumSign) | 1318 | 4.86k | if (((int32_t)(signbit ^ absSumSign)) < 0) | 1319 | 2.27k | { | 1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | 1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | 1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | 1323 | | | 1324 | 2.27k | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | 1325 | 2.27k | uint32_t minPos = 0; | 1326 | 2.27k | int8_t finalChange = 0; | 1327 | 2.27k | int curChange = 0; | 1328 | 2.27k | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; | 1329 | | | 1330 | 32.0k | for (n = (lastCG ? 
lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | 1331 | 29.7k | { | 1332 | 29.7k | const uint32_t blkPos = codeParams.scan[n + subPos]; | 1333 | 29.7k | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 1334 | 29.7k | const int absLevel = abs(dstCoeff[blkPos]); | 1335 | | // TODO: this is constant in non-scaling mode | 1336 | 29.7k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 1337 | 29.7k | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); | 1338 | | | 1339 | 29.7k | int d = abs(signCoef) - (unQuantLevel >> unquantShift); | 1340 | 29.7k | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); | 1341 | | | 1342 | 29.7k | const int64_t origDist = (((int64_t)d * d)); | 1343 | | | 1344 | 29.7k | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) | 1345 | | | 1346 | 29.7k | const uint32_t isOne = (absLevel == 1); | 1347 | 29.7k | if (dstCoeff[blkPos]) | 1348 | 28.6k | { | 1349 | 28.6k | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); | 1350 | 28.6k | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1351 | 28.6k | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); | 1352 | | | 1353 | | /* if decrementing would make the coeff 0, we can include the | 1354 | | * significant coeff flag cost savings */ | 1355 | 28.6k | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); | 1356 | 28.6k | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1357 | 28.6k | int downBits = rateIncDown[blkPos] - (isOne ? 
(IEP_RATE + sigRateDelta[blkPos]) : 0); | 1358 | 28.6k | int64_t costDown = DELTARDCOST(origDist, d, downBits); | 1359 | | | 1360 | 28.6k | costDown -= lastCoeffAdjust; | 1361 | 28.6k | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; | 1362 | | | 1363 | 28.6k | curChange = 2 * (costUp < costDown) - 1; | 1364 | 28.6k | curCost = (costUp < costDown) ? costUp : curCost; | 1365 | 28.6k | } | 1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) | 1367 | 1.06k | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) | 1368 | 0 | { | 1369 | | /* don't try to make a new coded coeff before the first coeff if its | 1370 | | * sign would be different than the first coeff, the inferred sign would | 1371 | | * still be wrong and we'd have to do this again. */ | 1372 | 0 | curCost = MAX_INT64; | 1373 | 0 | } | 1374 | 1.06k | else | 1375 | 1.06k | { | 1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | 1377 | 1.06k | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); | 1378 | 1.06k | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); | 1379 | 1.06k | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | 1380 | 1.06k | curChange = 1; | 1381 | 1.06k | } | 1382 | | | 1383 | 29.7k | if (curCost < minCostInc) | 1384 | 7.87k | { | 1385 | 7.87k | minCostInc = curCost; | 1386 | 7.87k | finalChange = (int8_t)curChange; | 1387 | 7.87k | minPos = blkPos + (absLevel << 16); | 1388 | 7.87k | } | 1389 | 29.7k | lastCoeffAdjust = 0; | 1390 | 29.7k | } | 1391 | | | 1392 | 2.27k | const int absInMinPos = (minPos >> 16); | 1393 | 2.27k | minPos = (uint16_t)minPos; | 1394 | | | 1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | 1396 | 2.27k | if (absInMinPos >= 32767) | 1397 | | /* don't allow sign hiding to violate the SPEC range */ | 1398 | 0 | finalChange = -1; | 1399 | | 
| 1400 | | // NOTE: Reference code | 1401 | | //if (dstCoeff[minPos] == 0) | 1402 | | // numSig++; | 1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | 1404 | | // numSig--; | 1405 | 2.27k | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); | 1406 | | | 1407 | | | 1408 | | // NOTE: Reference code | 1409 | | //if (m_resiDctCoeff[minPos] >= 0) | 1410 | | // dstCoeff[minPos] += finalChange; | 1411 | | //else | 1412 | | // dstCoeff[minPos] -= finalChange; | 1413 | 2.27k | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); | 1414 | 2.27k | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); | 1415 | 2.27k | } | 1416 | 4.86k | } | 1417 | | | 1418 | 4.99k | lastCG = 0; | 1419 | 4.99k | } | 1420 | 4.99k | } | 1421 | | | 1422 | 55.1k | return numSig; | 1423 | 7.45M | } |
unsigned int x265::Quant::rdoQuant<3u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Line | Count | Source | 611 | 1.25M | { | 612 | 1.25M | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | 613 | 1.25M | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | 614 | 1.25M | const uint32_t usePsyMask = usePsy ? -1 : 0; | 615 | | | 616 | 1.25M | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | 617 | | | 618 | 1.25M | int rem = m_qpParam[ttype].rem; | 619 | 1.25M | int per = m_qpParam[ttype].per; | 620 | 1.25M | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | 621 | 1.25M | int add = (1 << (qbits - 1)); | 622 | 1.25M | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | 623 | | | 624 | 1.25M | const int numCoeff = 1 << (log2TrSize * 2); | 625 | 1.25M | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | 626 | 1.25M | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); | 627 | 1.25M | if (!numSig) | 628 | 1.23M | return 0; | 629 | 20.6k | const uint32_t trSize = 1 << log2TrSize; | 630 | 20.6k | int64_t lambda2 = m_qpParam[ttype].lambda2; | 631 | 20.6k | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); | 632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | 633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | 634 | | * at several stages. 
We skip the clipping for simplicity when measuring RD cost */ | 635 | 20.6k | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | 636 | 20.6k | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); | 637 | 20.6k | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | 638 | 20.6k | const int scaleBits = SCALE_BITS - 2 * transformShift; | 639 | | | 640 | 20.6k | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | 641 | 20.6k | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | 642 | 20.6k | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | 643 | 20.6k | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) | 644 | | | 645 | 20.6k | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ | 646 | 20.6k | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ | 647 | 20.6k | int64_t costSig[trSize * trSize]; /* lambda * bits */ | 648 | | | 649 | 20.6k | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ | 650 | 20.6k | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ | 651 | 20.6k | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ | 652 | | | 653 | 20.6k | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | 654 | 20.6k | uint64_t sigCoeffGroupFlag64 = 0; | 655 | | | 656 | 20.6k | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | 657 | 20.6k | bool bIsLuma = ttype == TEXT_LUMA; | 658 | | | 659 | | /* total rate distortion cost of transform block, as CBF=0 */ | 660 | 20.6k | int64_t totalUncodedCost = 0; | 661 | | | 662 | | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, | 663 | | * the distortion and signal cost of coded blocks, and the coding cost of 
significant | 664 | | * coefficient and coefficient group bitmaps */ | 665 | 20.6k | int64_t totalRdCost = 0; | 666 | | | 667 | 20.6k | TUEntropyCodingParameters codeParams; | 668 | 20.6k | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | 669 | 20.6k | const uint32_t log2TrSizeCG = log2TrSize - 2; | 670 | 20.6k | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); | 671 | 20.6k | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); | 672 | | | 673 | 20.6k | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] | 674 | 20.6k | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign | 675 | 20.6k | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff | 676 | | | 677 | | #if CHECKED_BUILD || _DEBUG | 678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group | 679 | | memset(coeffNum, 0, sizeof(coeffNum)); | 680 | | memset(coeffSign, 0, sizeof(coeffNum)); | 681 | | memset(coeffFlag, 0, sizeof(coeffNum)); | 682 | | #endif | 683 | 20.6k | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); | 684 | 20.6k | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); | 685 | | | 686 | | | 687 | | /* TODO: update bit estimates if dirty */ | 688 | 20.6k | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | 689 | | | 690 | 20.6k | uint32_t scanPos = 0; | 691 | 20.6k | uint32_t c1 = 1; | 692 | | | 693 | | // process trail all zero Coeff Group | 694 | | | 695 | | /* coefficients after lastNZ have no distortion signal cost */ | 696 | 20.6k | const int zeroCG = cgNum - 1 - cgLastScanPos; | 697 | 20.6k | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 698 | 20.6k | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 699 | | | 700 | | /* sum 
zero coeff (uncodec) cost */ | 701 | | | 702 | | // TODO: does we need these cost? | 703 | 20.6k | if (usePsyMask) | 704 | 5.12k | { | 705 | 20.4k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 706 | 15.3k | { | 707 | 15.3k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 708 | 15.3k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 709 | 15.3k | uint32_t blkPos = codeParams.scan[scanPosBase]; | 710 | 15.3k | #if X265_ARCH_X86 | 711 | 15.3k | bool enable512 = detect512(); | 712 | 15.3k | if (enable512) | 713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 714 | 15.3k | else | 715 | 15.3k | { | 716 | 15.3k | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); | 717 | 15.3k | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 718 | 15.3k | } | 719 | | #else | 720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 722 | | #endif | 723 | 15.3k | } | 724 | 5.12k | } | 725 | 15.5k | else | 726 | 15.5k | { | 727 | | // non-psy path | 728 | 60.7k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 729 | 45.1k | { | 730 | 45.1k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 731 | 45.1k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 732 | 45.1k | uint32_t blkPos = codeParams.scan[scanPosBase]; | 733 | 45.1k | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 734 | 45.1k | } | 735 | 15.5k | } | 736 | 20.6k | static 
const uint8_t table_cnt[5][SCAN_SET_SIZE] = | 737 | 20.6k | { | 738 | | // patternSigCtx = 0 | 739 | 20.6k | { | 740 | 20.6k | 2, 1, 1, 0, | 741 | 20.6k | 1, 1, 0, 0, | 742 | 20.6k | 1, 0, 0, 0, | 743 | 20.6k | 0, 0, 0, 0, | 744 | 20.6k | }, | 745 | | // patternSigCtx = 1 | 746 | 20.6k | { | 747 | 20.6k | 2, 2, 2, 2, | 748 | 20.6k | 1, 1, 1, 1, | 749 | 20.6k | 0, 0, 0, 0, | 750 | 20.6k | 0, 0, 0, 0, | 751 | 20.6k | }, | 752 | | // patternSigCtx = 2 | 753 | 20.6k | { | 754 | 20.6k | 2, 1, 0, 0, | 755 | 20.6k | 2, 1, 0, 0, | 756 | 20.6k | 2, 1, 0, 0, | 757 | 20.6k | 2, 1, 0, 0, | 758 | 20.6k | }, | 759 | | // patternSigCtx = 3 | 760 | 20.6k | { | 761 | 20.6k | 2, 2, 2, 2, | 762 | 20.6k | 2, 2, 2, 2, | 763 | 20.6k | 2, 2, 2, 2, | 764 | 20.6k | 2, 2, 2, 2, | 765 | 20.6k | }, | 766 | | // 4x4 | 767 | 20.6k | { | 768 | 20.6k | 0, 1, 4, 5, | 769 | 20.6k | 2, 3, 4, 5, | 770 | 20.6k | 6, 6, 8, 8, | 771 | 20.6k | 7, 7, 8, 8 | 772 | 20.6k | } | 773 | 20.6k | }; | 774 | | | 775 | | /* iterate over coding groups in reverse scan order */ | 776 | 40.8k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) | 777 | 20.1k | { | 778 | 20.1k | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; | 779 | 20.1k | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | 780 | 20.1k | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; | 781 | 20.1k | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); | 782 | 20.1k | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | 783 | 20.1k | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 784 | 20.1k | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); | 785 | | | 786 | 20.1k | if (c1 == 0) | 787 | 0 | ctxSet++; | 788 | 20.1k | c1 = 1; | 789 | | | 790 | 20.1k | if (cgScanPos && (coeffNum[cgScanPos] == 0)) | 791 | 0 | { | 792 | | // TODO: does we need zero-coeff cost? 
| 793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 795 | 0 | if (usePsyMask) | 796 | 0 | { | 797 | 0 | #if X265_ARCH_X86 | 798 | 0 | bool enable512 = detect512(); | 799 | 0 | if (enable512) | 800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 801 | 0 | else | 802 | 0 | { | 803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 805 | 0 | } | 806 | | #else | 807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 809 | | #endif | 810 | 0 | blkPos = codeParams.scan[scanPosBase]; | 811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 812 | 0 | { | 813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 814 | 0 | { | 815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 819 | |
| 820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 823 | 0 | } | 824 | 0 | blkPos += trSize; | 825 | 0 | } | 826 | 0 | } | 827 | 0 | else | 828 | 0 | { | 829 | | // non-psy path | 830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 831 | 0 | blkPos = codeParams.scan[scanPosBase]; | 832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 833 | 0 | { | 834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 835 | 0 | { | 836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 840 | |
| 841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 844 | 0 | } | 845 | 0 | blkPos += trSize; | 846 | 0 | } | 847 | 0 | } | 848 | | | 849 | | /* there were no coded coefficients in this coefficient group */ | 850 | 0 | { | 851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 854 | 0 | } | 855 | 0 | continue; | 856 | 0 | } | 857 | | | 858 | 20.1k | coeffGroupRDStats cgRdStats; | 859 | 20.1k | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | 860 | | | 861 | 20.1k | uint32_t subFlagMask = coeffFlag[cgScanPos]; | 862 | 20.1k | int c2 = 0; | 863 | 20.1k | uint32_t goRiceParam = 0; | 864 | 20.1k | uint32_t levelThreshold = 3; | 865 | 20.1k | uint32_t c1Idx = 0; | 866 | 20.1k | uint32_t c2Idx = 0; | 867 | | /* iterate over coefficients in each group in reverse scan order */ | 868 | 343k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 869 | 322k | { | 870 | 322k | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | 871 | 322k | uint32_t blkPos = codeParams.scan[scanPos]; | 872 | 322k | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ | 873 | 322k | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 874 | 322k | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | 875 | | | 876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level | 877 | | * and the original DCT coefficient. 
The result is shifted scaleBits to account for the | 878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | 879 | | | 880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ | 881 | 322k | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; | 882 | 322k | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); | 883 | 322k | if (usePsyMask & scanPos) | 884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ | 885 | 76.8k | costUncoded[blkPos] -= PSYVALUE(predictedCoef); | 886 | | | 887 | 322k | totalUncodedCost += costUncoded[blkPos]; | 888 | | | 889 | | // coefficient level estimation | 890 | 322k | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; | 891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; | 892 | 322k | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; | 893 | 322k | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; | 894 | 322k | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; | 895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 
4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' | 896 | 322k | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 897 | | | 898 | | // before find lastest non-zero coeff | 899 | 322k | if (scanPos > (uint32_t)lastScanPos) | 900 | 302k | { | 901 | | /* coefficients after lastNZ have no distortion signal cost */ | 902 | 302k | costCoeff[scanPos] = 0; | 903 | 302k | costSig[scanPos] = 0; | 904 | | | 905 | | /* No non-zero coefficient yet found, but this does not mean | 906 | | * there is no uncoded-cost for this coefficient. Pre- | 907 | | * quantization the coefficient may have been non-zero */ | 908 | 302k | totalRdCost += costUncoded[blkPos]; | 909 | 302k | } | 910 | 20.1k | else if (!(subFlagMask & 1)) | 911 | 0 | { | 912 | | // fast zero coeff path | 913 | | /* set default costs to uncoded costs */ | 914 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 915 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 916 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 917 | 0 | totalRdCost += costCoeff[scanPos]; | 918 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; | 919 | |
| 920 | 0 | subFlagMask >>= 1; | 921 | 0 | } | 922 | 20.1k | else | 923 | 20.1k | { | 924 | 20.1k | subFlagMask >>= 1; | 925 | | | 926 | 20.1k | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | 927 | 20.1k | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} | 928 | | | 929 | 20.1k | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | 930 | 20.1k | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); | 931 | 20.1k | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | 932 | 20.1k | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); | 933 | | | 934 | | // coefficient level estimation | 935 | 20.1k | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; | 936 | 18.4E | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? 
levelAbsBits[1] : 0); | 937 | | | 938 | 20.1k | uint32_t level = 0; | 939 | 20.1k | uint32_t sigCoefBits = 0; | 940 | 20.1k | costCoeff[scanPos] = MAX_INT64; | 941 | | | 942 | 20.1k | if ((int)scanPos == lastScanPos) | 943 | 20.1k | sigRateDelta[blkPos] = 0; | 944 | 18.4E | else | 945 | 18.4E | { | 946 | 18.4E | if (maxAbsLevel < 3) | 947 | 0 | { | 948 | | /* set default costs to uncoded costs */ | 949 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 950 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 951 | 0 | } | 952 | 18.4E | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 953 | 18.4E | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; | 954 | 18.4E | } | 955 | | | 956 | 20.1k | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); | 957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) | 958 | 20.1k | if (maxAbsLevel == 1) | 959 | 10.7k | { | 960 | 10.7k | uint32_t levelBits = (c1c2idx & 1) ? 
greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; | 961 | 10.7k | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); | 962 | | | 963 | 10.7k | int unquantAbsLevel = unQuantLevel >> unquantShift; | 964 | 10.7k | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); | 965 | 10.7k | int d = abs(signCoef) - unquantAbsLevel; | 966 | 10.7k | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | 967 | | | 968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 969 | 10.7k | if (usePsyMask & scanPos) | 970 | 0 | { | 971 | 0 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | 972 | 0 | curCost -= PSYVALUE(reconCoef); | 973 | 0 | } | 974 | | | 975 | 10.7k | if (curCost < costCoeff[scanPos]) | 976 | 10.7k | { | 977 | 10.7k | level = 1; | 978 | 10.7k | costCoeff[scanPos] = curCost; | 979 | 10.7k | costSig[scanPos] = SIGCOST(sigCoefBits); | 980 | 10.7k | } | 981 | 10.7k | } | 982 | 9.41k | else if (maxAbsLevel) | 983 | 9.42k | { | 984 | 9.42k | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 985 | 9.42k | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 986 | | | 987 | 9.42k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 988 | | | 989 | 9.42k | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; | 990 | 9.42k | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); | 991 | 9.42k | int d0 = abs(signCoef) - unquantAbsLevel0; | 992 | 9.42k | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); | 993 | | | 994 | 9.42k | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; | 995 | 9.42k | 
X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); | 996 | 9.42k | int d1 = abs(signCoef) - unquantAbsLevel1; | 997 | 9.42k | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); | 998 | | | 999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 1000 | 9.42k | if (usePsyMask & scanPos) | 1001 | 0 | { | 1002 | 0 | int reconCoef; | 1003 | 0 | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); | 1004 | 0 | curCost0 -= PSYVALUE(reconCoef); | 1005 | |
| 1006 | 0 | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); | 1007 | 0 | curCost1 -= PSYVALUE(reconCoef); | 1008 | 0 | } | 1009 | 9.42k | if (curCost0 < costCoeff[scanPos]) | 1010 | 9.42k | { | 1011 | 9.42k | level = maxAbsLevel; | 1012 | 9.42k | costCoeff[scanPos] = curCost0; | 1013 | 9.42k | costSig[scanPos] = SIGCOST(sigCoefBits); | 1014 | 9.42k | } | 1015 | 9.42k | if (curCost1 < costCoeff[scanPos]) | 1016 | 0 | { | 1017 | 0 | level = maxAbsLevel - 1; | 1018 | 0 | costCoeff[scanPos] = curCost1; | 1019 | 0 | costSig[scanPos] = SIGCOST(sigCoefBits); | 1020 | 0 | } | 1021 | 9.42k | } | 1022 | | | 1023 | 20.1k | dstCoeff[blkPos] = (int16_t)level; | 1024 | 20.1k | totalRdCost += costCoeff[scanPos]; | 1025 | | | 1026 | | /* record costs for sign-hiding performed at the end */ | 1027 | 18.4E | if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level) | 1028 | 20.1k | { | 1029 | 20.1k | const int32_t diff0 = level - 1 - baseLevel; | 1030 | 20.1k | const int32_t diff2 = level + 1 - baseLevel; | 1031 | 20.1k | const int32_t maxVlc = g_goRiceRange[goRiceParam]; | 1032 | 20.1k | int rate0, rate1, rate2; | 1033 | | | 1034 | 20.1k | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% | 1035 | 10.7k | { | 1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} | 1037 | | // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 | 1038 | 10.7k | X265_CHECK(level == 1, "absLevel check failure\n"); | 1039 | | | 1040 | 10.7k | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; | 1041 | 10.7k | const int rateNotEqual2 = greaterOneBits[0]; | 1042 | | | 1043 | 10.7k | rate0 = 0; | 1044 | 10.7k | rate2 = rateEqual2; | 1045 | 10.7k | rate1 = rateNotEqual2; | 1046 | | | 1047 | 10.7k | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1048 | 10.7k | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1049 | 10.7k | X265_CHECK(rate0 == 
getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1050 | 10.7k | } | 1051 | 9.42k | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% | 1052 | 0 | { | 1053 | | // NOTE: no c1c2 correct rate since all of rate include this factor | 1054 | 0 | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); | 1055 | 0 | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); | 1056 | 0 | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); | 1057 | 0 | } | 1058 | 9.42k | else | 1059 | 9.42k | { | 1060 | 9.42k | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1061 | 9.42k | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1062 | 9.42k | rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1063 | 9.42k | } | 1064 | 20.1k | rateIncUp[blkPos] = rate2 - rate1; | 1065 | 20.1k | rateIncDown[blkPos] = rate0 - rate1; | 1066 | 20.1k | } | 1067 | 18.4E | else | 1068 | 18.4E | { | 1069 | 18.4E | rateIncUp[blkPos] = greaterOneBits[0]; | 1070 | 18.4E | rateIncDown[blkPos] = 0; | 1071 | 18.4E | } | 1072 | | | 1073 | | /* Update CABAC estimation state */ | 1074 | 20.1k | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) | 1075 | 9.42k | { | 1076 | 9.42k | goRiceParam++; | 1077 | 9.42k | levelThreshold <<= 1; | 1078 | 9.42k | } | 1079 | | | 1080 | 20.1k | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; | 1081 | 20.1k | c1Idx += isNonZero; | 1082 | | | 1083 | | /* update bin model */ | 1084 | 20.1k | if (level > 1) | 1085 | 9.42k | { | 1086 | 9.42k | c1 = 0; | 1087 | 9.42k | c2 += (uint32_t)(c2 - 2) >> 31; | 1088 | 9.42k | c2Idx++; | 1089 | 9.42k | } | 1090 | 10.7k | else if (((c1 == 1) | (c1 == 2)) & isNonZero) | 1091 | 10.7k | c1++; | 1092 | | | 1093 | 20.1k | if (dstCoeff[blkPos]) | 1094 | 20.1k | { 
| 1095 | 20.1k | sigCoeffGroupFlag64 |= cgBlkPosMask; | 1096 | 20.1k | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | 1097 | 20.1k | cgRdStats.uncodedDist += costUncoded[blkPos]; | 1098 | 20.1k | cgRdStats.nnzBeforePos0 += scanPosinCG; | 1099 | 20.1k | } | 1100 | 20.1k | } | 1101 | | | 1102 | 322k | cgRdStats.sigCost += costSig[scanPos]; | 1103 | 322k | } /* end for (scanPosinCG) */ | 1104 | | | 1105 | 20.1k | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); | 1106 | 20.1k | cgRdStats.sigCost0 = costSig[scanPos]; | 1107 | | | 1108 | 20.1k | costCoeffGroupSig[cgScanPos] = 0; | 1109 | | | 1110 | | /* nothing to do at this case */ | 1111 | 20.1k | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); | 1112 | | | 1113 | 20.1k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1114 | 20.1k | { | 1115 | | /* coeff group 0 is implied to be present, no signal cost */ | 1116 | | /* coeff group with last NZ is implied to be present, handled below */ | 1117 | 20.1k | } | 1118 | 18.4E | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | 1119 | 0 | { | 1120 | 0 | if (!cgRdStats.nnzBeforePos0) | 1121 | 0 | { | 1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | 1123 | 0 | totalRdCost -= cgRdStats.sigCost0; | 1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; | 1125 | 0 | } | 1126 | | | 1127 | | /* there are coded coefficients in this group, but now we include the signaling cost | 1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the | 1129 | | * coded group is more than the RD cost of the uncoded group */ | 1130 | |
| 1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1132 | |
| 1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | 1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | 1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | 1137 | |
| 1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | 1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | 1140 | |
| 1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) | 1142 | 0 | { | 1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | 1144 | 0 | totalRdCost = costZeroCG; | 1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1146 | | | 1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | 1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; | 1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1153 | 0 | } | 1154 | 0 | } | 1155 | 18.4E | else | 1156 | 18.4E | { | 1157 | | /* there were no coded coefficients in this coefficient group */ | 1158 | 18.4E | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1159 | 18.4E | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 1160 | 18.4E | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 1161 | 18.4E | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | 1162 | 18.4E | } | 1163 | 20.1k | } /* end for (cgScanPos) */ | 1164 | | | 1165 | 20.6k | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | 1166 | | | 1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | 1168 | 20.6k | int64_t bestCost; | 1169 | 20.6k | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | 1170 | 0 | { | 1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | 1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | 1173 | 0 | } | 1174 | 20.6k | else | 1175 | 20.6k | { | 1176 | 20.6k | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | 
1177 | 20.6k | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | 1178 | 20.6k | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | 1179 | 20.6k | } | 1180 | | | 1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last | 1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | 1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | 1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | 1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | 1186 | 20.6k | int bestLastIdx = 0; | 1187 | 20.6k | bool foundLast = false; | 1188 | 40.8k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | 1189 | 20.1k | { | 1190 | 20.1k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1191 | 20.1k | { | 1192 | | /* the presence of these coefficient groups are inferred, they have no bit in | 1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | 1194 | 20.1k | } | 1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | 1196 | 0 | { | 1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred | 1198 | | * from lastNZ if it were present in this group */ | 1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1200 | 0 | } | 1201 | 0 | else | 1202 | 0 | { | 1203 | | /* remove cost of signaling this empty group as not present */ | 1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1205 | 0 | continue; | 1206 | 0 | } | 1207 | | | 1208 | 333k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 1209 | 322k | { | 1210 | 322k | scanPos = cgScanPos * cgSize + scanPosinCG; | 1211 | 322k | if ((int)scanPos > lastScanPos) | 1212 | 302k | continue; | 1213 | | | 1214 | | /* if the coefficient was coded, 
measure the RD cost of it as the last non-zero and then | 1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | 1216 | | * cost of signaling it as not-significant */ | 1217 | 20.1k | uint32_t blkPos = codeParams.scan[scanPos]; | 1218 | 20.1k | if (dstCoeff[blkPos]) | 1219 | 20.1k | { | 1220 | | // Calculates the cost of signaling the last significant coefficient in the block | 1221 | 20.1k | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; | 1222 | 20.1k | if (codeParams.scanType == SCAN_VER) | 1223 | 373 | std::swap(pos[0], pos[1]); | 1224 | 20.1k | uint32_t bitsLastNZ = 0; | 1225 | | | 1226 | 60.5k | for (int i = 0; i < 2; i++) | 1227 | 40.3k | { | 1228 | 40.3k | int temp = g_lastCoeffTable[pos[i]]; | 1229 | 40.3k | int prefixOnes = temp & 15; | 1230 | 40.3k | int suffixLen = temp >> 4; | 1231 | | | 1232 | 40.3k | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; | 1233 | 40.3k | bitsLastNZ += IEP_RATE * suffixLen; | 1234 | 40.3k | } | 1235 | | | 1236 | 20.1k | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | 1237 | | | 1238 | 20.1k | if (costAsLast < bestCost) | 1239 | 10.4k | { | 1240 | 10.4k | bestLastIdx = scanPos + 1; | 1241 | 10.4k | bestCost = costAsLast; | 1242 | 10.4k | } | 1243 | 20.1k | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) | 1244 | 9.42k | { | 1245 | 9.42k | foundLast = true; | 1246 | 9.42k | break; | 1247 | 9.42k | } | 1248 | | | 1249 | 10.7k | totalRdCost -= costCoeff[scanPos]; | 1250 | 10.7k | totalRdCost += costUncoded[blkPos]; | 1251 | 10.7k | } | 1252 | 18.4E | else | 1253 | 18.4E | totalRdCost -= costSig[scanPos]; | 1254 | 20.1k | } | 1255 | 20.1k | } | 1256 | | | 1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ | 1258 | 20.6k | numSig = 0; | 1259 | 31.1k | for (int pos = 0; pos < bestLastIdx; pos++) | 1260 | 10.4k | { | 1261 | 10.4k | int blkPos = codeParams.scan[pos]; | 1262 | 10.4k | int level = 
dstCoeff[blkPos]; | 1263 | 10.4k | numSig += (level != 0); | 1264 | | | 1265 | 10.4k | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | 1266 | 10.4k | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | 1267 | 10.4k | } | 1268 | | | 1269 | | // Average 49.62 pixels | 1270 | | /* clean uncoded coefficients */ | 1271 | 20.6k | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); | 1272 | 333k | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) | 1273 | 312k | { | 1274 | 312k | dstCoeff[codeParams.scan[pos]] = 0; | 1275 | 312k | } | 1276 | 20.6k | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) | 1277 | 0 | { | 1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; | 1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1283 | 0 | } | 1284 | | | 1285 | | /* rate-distortion based sign-hiding */ | 1286 | 20.6k | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | 1287 | 0 | { | 1288 | 0 | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; | 1289 | 0 | int lastCG = 1; | 1290 | |
| 1291 | 0 | for (int subSet = realLastScanPos; subSet >= 0; subSet--) | 1292 | 0 | { | 1293 | 0 | int subPos = subSet << LOG2_SCAN_SET_SIZE; | 1294 | 0 | int n; | 1295 | |
| 1296 | 0 | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) | 1297 | 0 | continue; | 1298 | | | 1299 | | /* measure distance between first and last non-zero coef in this | 1300 | | * coding group */ | 1301 | 0 | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]); | 1302 | 0 | const int firstNZPosInCG = (uint8_t)posFirstLast; | 1303 | 0 | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); | 1304 | 0 | const uint32_t absSumSign = posFirstLast; | 1305 | |
| 1306 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | 1307 | 0 | { | 1308 | 0 | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); | 1309 | |
| 1310 | | #if CHECKED_BUILD || _DEBUG | 1311 | | int32_t absSum_dummy = 0; | 1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | 1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; | 1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); | 1315 | | #endif | 1316 | | | 1317 | | //if (signbit != absSumSign) | 1318 | 0 | if (((int32_t)(signbit ^ absSumSign)) < 0) | 1319 | 0 | { | 1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | 1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | 1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | 1323 | |
| 1324 | 0 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | 1325 | 0 | uint32_t minPos = 0; | 1326 | 0 | int8_t finalChange = 0; | 1327 | 0 | int curChange = 0; | 1328 | 0 | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; | 1329 | |
| 1330 | 0 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | 1331 | 0 | { | 1332 | 0 | const uint32_t blkPos = codeParams.scan[n + subPos]; | 1333 | 0 | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 1334 | 0 | const int absLevel = abs(dstCoeff[blkPos]); | 1335 | | // TODO: this is constant in non-scaling mode | 1336 | 0 | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 1337 | 0 | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); | 1338 | |
| 1339 | 0 | int d = abs(signCoef) - (unQuantLevel >> unquantShift); | 1340 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); | 1341 | |
| 1342 | 0 | const int64_t origDist = (((int64_t)d * d)); | 1343 | |
| 1344 | 0 | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) | 1345 | |
| 1346 | 0 | const uint32_t isOne = (absLevel == 1); | 1347 | 0 | if (dstCoeff[blkPos]) | 1348 | 0 | { | 1349 | 0 | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); | 1350 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1351 | 0 | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); | 1352 | | | 1353 | | /* if decrementing would make the coeff 0, we can include the | 1354 | | * significant coeff flag cost savings */ | 1355 | 0 | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); | 1356 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1357 | 0 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); | 1358 | 0 | int64_t costDown = DELTARDCOST(origDist, d, downBits); | 1359 | |
| 1360 | 0 | costDown -= lastCoeffAdjust; | 1361 | 0 | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; | 1362 | |
| 1363 | 0 | curChange = 2 * (costUp < costDown) - 1; | 1364 | 0 | curCost = (costUp < costDown) ? costUp : curCost; | 1365 | 0 | } | 1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) | 1367 | 0 | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) | 1368 | 0 | { | 1369 | | /* don't try to make a new coded coeff before the first coeff if its | 1370 | | * sign would be different than the first coeff, the inferred sign would | 1371 | | * still be wrong and we'd have to do this again. */ | 1372 | 0 | curCost = MAX_INT64; | 1373 | 0 | } | 1374 | 0 | else | 1375 | 0 | { | 1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | 1377 | 0 | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); | 1378 | 0 | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); | 1379 | 0 | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | 1380 | 0 | curChange = 1; | 1381 | 0 | } | 1382 | |
| 1383 | 0 | if (curCost < minCostInc) | 1384 | 0 | { | 1385 | 0 | minCostInc = curCost; | 1386 | 0 | finalChange = (int8_t)curChange; | 1387 | 0 | minPos = blkPos + (absLevel << 16); | 1388 | 0 | } | 1389 | 0 | lastCoeffAdjust = 0; | 1390 | 0 | } | 1391 | |
| 1392 | 0 | const int absInMinPos = (minPos >> 16); | 1393 | 0 | minPos = (uint16_t)minPos; | 1394 | | | 1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | 1396 | 0 | if (absInMinPos >= 32767) | 1397 | | /* don't allow sign hiding to violate the SPEC range */ | 1398 | 0 | finalChange = -1; | 1399 | | | 1400 | | // NOTE: Reference code | 1401 | | //if (dstCoeff[minPos] == 0) | 1402 | | // numSig++; | 1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | 1404 | | // numSig--; | 1405 | 0 | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); | 1406 | | | 1407 | | | 1408 | | // NOTE: Reference code | 1409 | | //if (m_resiDctCoeff[minPos] >= 0) | 1410 | | // dstCoeff[minPos] += finalChange; | 1411 | | //else | 1412 | | // dstCoeff[minPos] -= finalChange; | 1413 | 0 | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); | 1414 | 0 | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); | 1415 | 0 | } | 1416 | 0 | } | 1417 | |
| 1418 | 0 | lastCG = 0; | 1419 | 0 | } | 1420 | 0 | } | 1421 | | | 1422 | 20.6k | return numSig; | 1423 | 1.25M | } |
unsigned int x265::Quant::rdoQuant<4u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Line | Count | Source | 611 | 280k | { | 612 | 280k | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | 613 | 280k | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | 614 | 280k | const uint32_t usePsyMask = usePsy ? -1 : 0; | 615 | | | 616 | 280k | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | 617 | | | 618 | 280k | int rem = m_qpParam[ttype].rem; | 619 | 280k | int per = m_qpParam[ttype].per; | 620 | 280k | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | 621 | 280k | int add = (1 << (qbits - 1)); | 622 | 280k | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | 623 | | | 624 | 280k | const int numCoeff = 1 << (log2TrSize * 2); | 625 | 280k | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | 626 | 280k | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); | 627 | 280k | if (!numSig) | 628 | 269k | return 0; | 629 | 10.5k | const uint32_t trSize = 1 << log2TrSize; | 630 | 10.5k | int64_t lambda2 = m_qpParam[ttype].lambda2; | 631 | 10.5k | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); | 632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | 633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | 634 | | * at several stages. We skip the clipping for simplicity when measuring RD cost */ | 635 | 10.5k | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | 636 | 10.5k | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 
4 : 0); | 637 | 10.5k | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | 638 | 10.5k | const int scaleBits = SCALE_BITS - 2 * transformShift; | 639 | | | 640 | 10.5k | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | 641 | 10.5k | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | 642 | 10.5k | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | 643 | 10.5k | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) | 644 | | | 645 | 10.5k | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ | 646 | 10.5k | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ | 647 | 10.5k | int64_t costSig[trSize * trSize]; /* lambda * bits */ | 648 | | | 649 | 10.5k | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ | 650 | 10.5k | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ | 651 | 10.5k | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ | 652 | | | 653 | 10.5k | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | 654 | 10.5k | uint64_t sigCoeffGroupFlag64 = 0; | 655 | | | 656 | 10.5k | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | 657 | 10.5k | bool bIsLuma = ttype == TEXT_LUMA; | 658 | | | 659 | | /* total rate distortion cost of transform block, as CBF=0 */ | 660 | 10.5k | int64_t totalUncodedCost = 0; | 661 | | | 662 | | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, | 663 | | * the distortion and signal cost of coded blocks, and the coding cost of significant | 664 | | * coefficient and coefficient group bitmaps */ | 665 | 10.5k | int64_t totalRdCost = 0; | 666 | | | 667 | 10.5k | TUEntropyCodingParameters codeParams; | 668 | 10.5k | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | 669 | 10.5k | const 
uint32_t log2TrSizeCG = log2TrSize - 2; | 670 | 10.5k | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); | 671 | 10.5k | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); | 672 | | | 673 | 10.5k | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] | 674 | 10.5k | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign | 675 | 10.5k | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff | 676 | | | 677 | | #if CHECKED_BUILD || _DEBUG | 678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group | 679 | | memset(coeffNum, 0, sizeof(coeffNum)); | 680 | | memset(coeffSign, 0, sizeof(coeffNum)); | 681 | | memset(coeffFlag, 0, sizeof(coeffNum)); | 682 | | #endif | 683 | 10.5k | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); | 684 | 10.5k | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); | 685 | | | 686 | | | 687 | | /* TODO: update bit estimates if dirty */ | 688 | 10.5k | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | 689 | | | 690 | 10.5k | uint32_t scanPos = 0; | 691 | 10.5k | uint32_t c1 = 1; | 692 | | | 693 | | // process trail all zero Coeff Group | 694 | | | 695 | | /* coefficients after lastNZ have no distortion signal cost */ | 696 | 10.5k | const int zeroCG = cgNum - 1 - cgLastScanPos; | 697 | 10.5k | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 698 | 10.5k | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 699 | | | 700 | | /* sum zero coeff (uncodec) cost */ | 701 | | | 702 | | // TODO: does we need these cost? 
| 703 | 10.5k | if (usePsyMask) | 704 | 6.46k | { | 705 | 103k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 706 | 97.0k | { | 707 | 97.0k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 708 | 97.0k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 709 | 97.0k | uint32_t blkPos = codeParams.scan[scanPosBase]; | 710 | 97.0k | #if X265_ARCH_X86 | 711 | 97.0k | bool enable512 = detect512(); | 712 | 97.0k | if (enable512) | 713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 714 | 97.0k | else | 715 | 97.0k | { | 716 | 97.0k | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); | 717 | 97.0k | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 718 | 97.0k | } | 719 | | #else | 720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 722 | | #endif | 723 | 97.0k | } | 724 | 6.46k | } | 725 | 4.12k | else | 726 | 4.12k | { | 727 | | // non-psy path | 728 | 65.1k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 729 | 61.0k | { | 730 | 61.0k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 731 | 61.0k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 732 | 61.0k | uint32_t blkPos = codeParams.scan[scanPosBase]; | 733 | 61.0k | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 734 | 61.0k | } | 735 | 4.12k | } | 736 | 10.5k | static const uint8_t table_cnt[5][SCAN_SET_SIZE] = | 737 | 10.5k | { | 738 | | // 
patternSigCtx = 0 | 739 | 10.5k | { | 740 | 10.5k | 2, 1, 1, 0, | 741 | 10.5k | 1, 1, 0, 0, | 742 | 10.5k | 1, 0, 0, 0, | 743 | 10.5k | 0, 0, 0, 0, | 744 | 10.5k | }, | 745 | | // patternSigCtx = 1 | 746 | 10.5k | { | 747 | 10.5k | 2, 2, 2, 2, | 748 | 10.5k | 1, 1, 1, 1, | 749 | 10.5k | 0, 0, 0, 0, | 750 | 10.5k | 0, 0, 0, 0, | 751 | 10.5k | }, | 752 | | // patternSigCtx = 2 | 753 | 10.5k | { | 754 | 10.5k | 2, 1, 0, 0, | 755 | 10.5k | 2, 1, 0, 0, | 756 | 10.5k | 2, 1, 0, 0, | 757 | 10.5k | 2, 1, 0, 0, | 758 | 10.5k | }, | 759 | | // patternSigCtx = 3 | 760 | 10.5k | { | 761 | 10.5k | 2, 2, 2, 2, | 762 | 10.5k | 2, 2, 2, 2, | 763 | 10.5k | 2, 2, 2, 2, | 764 | 10.5k | 2, 2, 2, 2, | 765 | 10.5k | }, | 766 | | // 4x4 | 767 | 10.5k | { | 768 | 10.5k | 0, 1, 4, 5, | 769 | 10.5k | 2, 3, 4, 5, | 770 | 10.5k | 6, 6, 8, 8, | 771 | 10.5k | 7, 7, 8, 8 | 772 | 10.5k | } | 773 | 10.5k | }; | 774 | | | 775 | | /* iterate over coding groups in reverse scan order */ | 776 | 21.1k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) | 777 | 10.5k | { | 778 | 10.5k | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; | 779 | 10.5k | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | 780 | 10.5k | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; | 781 | 10.5k | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); | 782 | 10.5k | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | 783 | 10.5k | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 784 | 10.5k | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); | 785 | | | 786 | 10.5k | if (c1 == 0) | 787 | 0 | ctxSet++; | 788 | 10.5k | c1 = 1; | 789 | | | 790 | 10.5k | if (cgScanPos && (coeffNum[cgScanPos] == 0)) | 791 | 0 | { | 792 | | // TODO: does we need zero-coeff cost? 
| 793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 795 | 0 | if (usePsyMask) | 796 | 0 | { | 797 | 0 | #if X265_ARCH_X86 | 798 | 0 | bool enable512 = detect512(); | 799 | 0 | if (enable512) | 800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 801 | 0 | else | 802 | 0 | { | 803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 805 | 0 | } | 806 | | #else | 807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 809 | | #endif | 810 | 0 | blkPos = codeParams.scan[scanPosBase]; | 811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 812 | 0 | { | 813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 814 | 0 | { | 815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 819 | |
| 820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 823 | 0 | } | 824 | 0 | blkPos += trSize; | 825 | 0 | } | 826 | 0 | } | 827 | 0 | else | 828 | 0 | { | 829 | | // non-psy path | 830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 831 | 0 | blkPos = codeParams.scan[scanPosBase]; | 832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 833 | 0 | { | 834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 835 | 0 | { | 836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 840 | |
| 841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 844 | 0 | } | 845 | 0 | blkPos += trSize; | 846 | 0 | } | 847 | 0 | } | 848 | | | 849 | | /* there were no coded coefficients in this coefficient group */ | 850 | 0 | { | 851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 854 | 0 | } | 855 | 0 | continue; | 856 | 0 | } | 857 | | | 858 | 10.5k | coeffGroupRDStats cgRdStats; | 859 | 10.5k | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | 860 | | | 861 | 10.5k | uint32_t subFlagMask = coeffFlag[cgScanPos]; | 862 | 10.5k | int c2 = 0; | 863 | 10.5k | uint32_t goRiceParam = 0; | 864 | 10.5k | uint32_t levelThreshold = 3; | 865 | 10.5k | uint32_t c1Idx = 0; | 866 | 10.5k | uint32_t c2Idx = 0; | 867 | | /* iterate over coefficients in each group in reverse scan order */ | 868 | 179k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 869 | 168k | { | 870 | 168k | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | 871 | 168k | uint32_t blkPos = codeParams.scan[scanPos]; | 872 | 168k | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ | 873 | 168k | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 874 | 168k | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | 875 | | | 876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level | 877 | | * and the original DCT coefficient. 
The result is shifted scaleBits to account for the | 878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | 879 | | | 880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ | 881 | 168k | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; | 882 | 168k | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); | 883 | 168k | if (usePsyMask & scanPos) | 884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ | 885 | 97.0k | costUncoded[blkPos] -= PSYVALUE(predictedCoef); | 886 | | | 887 | 168k | totalUncodedCost += costUncoded[blkPos]; | 888 | | | 889 | | // coefficient level estimation | 890 | 168k | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; | 891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; | 892 | 168k | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; | 893 | 168k | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; | 894 | 168k | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; | 895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 
4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' | 896 | 168k | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 897 | | | 898 | | // before find lastest non-zero coeff | 899 | 168k | if (scanPos > (uint32_t)lastScanPos) | 900 | 158k | { | 901 | | /* coefficients after lastNZ have no distortion signal cost */ | 902 | 158k | costCoeff[scanPos] = 0; | 903 | 158k | costSig[scanPos] = 0; | 904 | | | 905 | | /* No non-zero coefficient yet found, but this does not mean | 906 | | * there is no uncoded-cost for this coefficient. Pre- | 907 | | * quantization the coefficient may have been non-zero */ | 908 | 158k | totalRdCost += costUncoded[blkPos]; | 909 | 158k | } | 910 | 10.5k | else if (!(subFlagMask & 1)) | 911 | 0 | { | 912 | | // fast zero coeff path | 913 | | /* set default costs to uncoded costs */ | 914 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 915 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 916 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 917 | 0 | totalRdCost += costCoeff[scanPos]; | 918 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; | 919 | |
| 920 | 0 | subFlagMask >>= 1; | 921 | 0 | } | 922 | 10.5k | else | 923 | 10.5k | { | 924 | 10.5k | subFlagMask >>= 1; | 925 | | | 926 | 10.5k | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | 927 | 10.5k | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} | 928 | | | 929 | 10.5k | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | 930 | 10.5k | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); | 931 | 10.5k | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | 932 | 10.5k | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); | 933 | | | 934 | | // coefficient level estimation | 935 | 10.5k | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; | 936 | 10.5k | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? 
levelAbsBits[1] : 0); | 937 | | | 938 | 10.5k | uint32_t level = 0; | 939 | 10.5k | uint32_t sigCoefBits = 0; | 940 | 10.5k | costCoeff[scanPos] = MAX_INT64; | 941 | | | 942 | 10.5k | if ((int)scanPos == lastScanPos) | 943 | 10.5k | sigRateDelta[blkPos] = 0; | 944 | 0 | else | 945 | 0 | { | 946 | 0 | if (maxAbsLevel < 3) | 947 | 0 | { | 948 | | /* set default costs to uncoded costs */ | 949 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 950 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 951 | 0 | } | 952 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 953 | 0 | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; | 954 | 0 | } | 955 | | | 956 | 10.5k | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); | 957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) | 958 | 10.5k | if (maxAbsLevel == 1) | 959 | 2.58k | { | 960 | 2.58k | uint32_t levelBits = (c1c2idx & 1) ? 
greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; | 961 | 2.58k | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); | 962 | | | 963 | 2.58k | int unquantAbsLevel = unQuantLevel >> unquantShift; | 964 | 2.58k | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); | 965 | 2.58k | int d = abs(signCoef) - unquantAbsLevel; | 966 | 2.58k | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | 967 | | | 968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 969 | 2.58k | if (usePsyMask & scanPos) | 970 | 0 | { | 971 | 0 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | 972 | 0 | curCost -= PSYVALUE(reconCoef); | 973 | 0 | } | 974 | | | 975 | 2.58k | if (curCost < costCoeff[scanPos]) | 976 | 2.58k | { | 977 | 2.58k | level = 1; | 978 | 2.58k | costCoeff[scanPos] = curCost; | 979 | 2.58k | costSig[scanPos] = SIGCOST(sigCoefBits); | 980 | 2.58k | } | 981 | 2.58k | } | 982 | 7.95k | else if (maxAbsLevel) | 983 | 7.95k | { | 984 | 7.95k | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 985 | 7.95k | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 986 | | | 987 | 7.95k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 988 | | | 989 | 7.95k | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; | 990 | 7.95k | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); | 991 | 7.95k | int d0 = abs(signCoef) - unquantAbsLevel0; | 992 | 7.95k | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); | 993 | | | 994 | 7.95k | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; | 995 | 7.95k | 
X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); | 996 | 7.95k | int d1 = abs(signCoef) - unquantAbsLevel1; | 997 | 7.95k | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); | 998 | | | 999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 1000 | 7.95k | if (usePsyMask & scanPos) | 1001 | 0 | { | 1002 | 0 | int reconCoef; | 1003 | 0 | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); | 1004 | 0 | curCost0 -= PSYVALUE(reconCoef); | 1005 | |
| 1006 | 0 | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); | 1007 | 0 | curCost1 -= PSYVALUE(reconCoef); | 1008 | 0 | } | 1009 | 7.95k | if (curCost0 < costCoeff[scanPos]) | 1010 | 7.95k | { | 1011 | 7.95k | level = maxAbsLevel; | 1012 | 7.95k | costCoeff[scanPos] = curCost0; | 1013 | 7.95k | costSig[scanPos] = SIGCOST(sigCoefBits); | 1014 | 7.95k | } | 1015 | 7.95k | if (curCost1 < costCoeff[scanPos]) | 1016 | 64 | { | 1017 | 64 | level = maxAbsLevel - 1; | 1018 | 64 | costCoeff[scanPos] = curCost1; | 1019 | 64 | costSig[scanPos] = SIGCOST(sigCoefBits); | 1020 | 64 | } | 1021 | 7.95k | } | 1022 | | | 1023 | 10.5k | dstCoeff[blkPos] = (int16_t)level; | 1024 | 10.5k | totalRdCost += costCoeff[scanPos]; | 1025 | | | 1026 | | /* record costs for sign-hiding performed at the end */ | 1027 | 10.5k | if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level) | 1028 | 10.5k | { | 1029 | 10.5k | const int32_t diff0 = level - 1 - baseLevel; | 1030 | 10.5k | const int32_t diff2 = level + 1 - baseLevel; | 1031 | 10.5k | const int32_t maxVlc = g_goRiceRange[goRiceParam]; | 1032 | 10.5k | int rate0, rate1, rate2; | 1033 | | | 1034 | 10.5k | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% | 1035 | 2.58k | { | 1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} | 1037 | | // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 | 1038 | 2.58k | X265_CHECK(level == 1, "absLevel check failure\n"); | 1039 | | | 1040 | 2.58k | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; | 1041 | 2.58k | const int rateNotEqual2 = greaterOneBits[0]; | 1042 | | | 1043 | 2.58k | rate0 = 0; | 1044 | 2.58k | rate2 = rateEqual2; | 1045 | 2.58k | rate1 = rateNotEqual2; | 1046 | | | 1047 | 2.58k | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1048 | 2.58k | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1049 | 2.58k | X265_CHECK(rate0 == 
getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1050 | 2.58k | } | 1051 | 7.95k | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% | 1052 | 0 | { | 1053 | | // NOTE: no c1c2 correct rate since all of rate include this factor | 1054 | 0 | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); | 1055 | 0 | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); | 1056 | 0 | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); | 1057 | 0 | } | 1058 | 7.95k | else | 1059 | 7.95k | { | 1060 | 7.95k | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1061 | 7.95k | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1062 | 7.95k | rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1063 | 7.95k | } | 1064 | 10.5k | rateIncUp[blkPos] = rate2 - rate1; | 1065 | 10.5k | rateIncDown[blkPos] = rate0 - rate1; | 1066 | 10.5k | } | 1067 | 0 | else | 1068 | 0 | { | 1069 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; | 1070 | 0 | rateIncDown[blkPos] = 0; | 1071 | 0 | } | 1072 | | | 1073 | | /* Update CABAC estimation state */ | 1074 | 10.5k | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) | 1075 | 7.95k | { | 1076 | 7.95k | goRiceParam++; | 1077 | 7.95k | levelThreshold <<= 1; | 1078 | 7.95k | } | 1079 | | | 1080 | 10.5k | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; | 1081 | 10.5k | c1Idx += isNonZero; | 1082 | | | 1083 | | /* update bin model */ | 1084 | 10.5k | if (level > 1) | 1085 | 7.95k | { | 1086 | 7.95k | c1 = 0; | 1087 | 7.95k | c2 += (uint32_t)(c2 - 2) >> 31; | 1088 | 7.95k | c2Idx++; | 1089 | 7.95k | } | 1090 | 2.58k | else if (((c1 == 1) | (c1 == 2)) & isNonZero) | 1091 | 2.58k | c1++; | 1092 | | | 1093 | 10.5k | if (dstCoeff[blkPos]) | 1094 | 10.5k | { | 1095 | 10.5k | 
sigCoeffGroupFlag64 |= cgBlkPosMask; | 1096 | 10.5k | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | 1097 | 10.5k | cgRdStats.uncodedDist += costUncoded[blkPos]; | 1098 | 10.5k | cgRdStats.nnzBeforePos0 += scanPosinCG; | 1099 | 10.5k | } | 1100 | 10.5k | } | 1101 | | | 1102 | 168k | cgRdStats.sigCost += costSig[scanPos]; | 1103 | 168k | } /* end for (scanPosinCG) */ | 1104 | | | 1105 | 10.5k | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); | 1106 | 10.5k | cgRdStats.sigCost0 = costSig[scanPos]; | 1107 | | | 1108 | 10.5k | costCoeffGroupSig[cgScanPos] = 0; | 1109 | | | 1110 | | /* nothing to do at this case */ | 1111 | 10.5k | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); | 1112 | | | 1113 | 10.5k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1114 | 10.5k | { | 1115 | | /* coeff group 0 is implied to be present, no signal cost */ | 1116 | | /* coeff group with last NZ is implied to be present, handled below */ | 1117 | 10.5k | } | 1118 | 0 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | 1119 | 0 | { | 1120 | 0 | if (!cgRdStats.nnzBeforePos0) | 1121 | 0 | { | 1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | 1123 | 0 | totalRdCost -= cgRdStats.sigCost0; | 1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; | 1125 | 0 | } | 1126 | | | 1127 | | /* there are coded coefficients in this group, but now we include the signaling cost | 1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the | 1129 | | * coded group is more than the RD cost of the uncoded group */ | 1130 | |
| 1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1132 | |
| 1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | 1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | 1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | 1137 | |
| 1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | 1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | 1140 | |
| 1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) | 1142 | 0 | { | 1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | 1144 | 0 | totalRdCost = costZeroCG; | 1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1146 | | | 1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | 1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; | 1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1153 | 0 | } | 1154 | 0 | } | 1155 | 0 | else | 1156 | 0 | { | 1157 | | /* there were no coded coefficients in this coefficient group */ | 1158 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1159 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 1160 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 1161 | 0 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | 1162 | 0 | } | 1163 | 10.5k | } /* end for (cgScanPos) */ | 1164 | | | 1165 | 10.5k | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | 1166 | | | 1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | 1168 | 10.5k | int64_t bestCost; | 1169 | 10.5k | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | 1170 | 0 | { | 1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | 1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | 1173 | 0 | } | 1174 | 10.5k | else | 1175 | 10.5k | { | 1176 | 10.5k | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | 1177 | 10.5k | bestCost = 
totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | 1178 | 10.5k | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | 1179 | 10.5k | } | 1180 | | | 1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last | 1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | 1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | 1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | 1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | 1186 | 10.5k | int bestLastIdx = 0; | 1187 | 10.5k | bool foundLast = false; | 1188 | 21.1k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | 1189 | 10.5k | { | 1190 | 10.5k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1191 | 10.5k | { | 1192 | | /* the presence of these coefficient groups are inferred, they have no bit in | 1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | 1194 | 10.5k | } | 1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | 1196 | 0 | { | 1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred | 1198 | | * from lastNZ if it were present in this group */ | 1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1200 | 0 | } | 1201 | 0 | else | 1202 | 0 | { | 1203 | | /* remove cost of signaling this empty group as not present */ | 1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1205 | 0 | continue; | 1206 | 0 | } | 1207 | | | 1208 | 171k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 1209 | 168k | { | 1210 | 168k | scanPos = cgScanPos * cgSize + scanPosinCG; | 1211 | 168k | if ((int)scanPos > lastScanPos) | 1212 | 158k | continue; | 1213 | | | 1214 | | /* if the coefficient was coded, measure the RD cost of it as 
the last non-zero and then | 1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | 1216 | | * cost of signaling it as not-significant */ | 1217 | 10.5k | uint32_t blkPos = codeParams.scan[scanPos]; | 1218 | 10.5k | if (dstCoeff[blkPos]) | 1219 | 10.5k | { | 1220 | | // Calculates the cost of signaling the last significant coefficient in the block | 1221 | 10.5k | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; | 1222 | 10.5k | if (codeParams.scanType == SCAN_VER) | 1223 | 0 | std::swap(pos[0], pos[1]); | 1224 | 10.5k | uint32_t bitsLastNZ = 0; | 1225 | | | 1226 | 31.6k | for (int i = 0; i < 2; i++) | 1227 | 21.0k | { | 1228 | 21.0k | int temp = g_lastCoeffTable[pos[i]]; | 1229 | 21.0k | int prefixOnes = temp & 15; | 1230 | 21.0k | int suffixLen = temp >> 4; | 1231 | | | 1232 | 21.0k | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; | 1233 | 21.0k | bitsLastNZ += IEP_RATE * suffixLen; | 1234 | 21.0k | } | 1235 | | | 1236 | 10.5k | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | 1237 | | | 1238 | 10.5k | if (costAsLast < bestCost) | 1239 | 9.79k | { | 1240 | 9.79k | bestLastIdx = scanPos + 1; | 1241 | 9.79k | bestCost = costAsLast; | 1242 | 9.79k | } | 1243 | 10.5k | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) | 1244 | 7.95k | { | 1245 | 7.95k | foundLast = true; | 1246 | 7.95k | break; | 1247 | 7.95k | } | 1248 | | | 1249 | 2.58k | totalRdCost -= costCoeff[scanPos]; | 1250 | 2.58k | totalRdCost += costUncoded[blkPos]; | 1251 | 2.58k | } | 1252 | 0 | else | 1253 | 0 | totalRdCost -= costSig[scanPos]; | 1254 | 10.5k | } | 1255 | 10.5k | } | 1256 | | | 1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ | 1258 | 10.5k | numSig = 0; | 1259 | 20.3k | for (int pos = 0; pos < bestLastIdx; pos++) | 1260 | 9.79k | { | 1261 | 9.79k | int blkPos = codeParams.scan[pos]; | 1262 | 9.79k | int level = dstCoeff[blkPos]; | 1263 | 9.79k | numSig 
+= (level != 0); | 1264 | | | 1265 | 9.79k | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | 1266 | 9.79k | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | 1267 | 9.79k | } | 1268 | | | 1269 | | // Average 49.62 pixels | 1270 | | /* clean uncoded coefficients */ | 1271 | 10.5k | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); | 1272 | 169k | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) | 1273 | 158k | { | 1274 | 158k | dstCoeff[codeParams.scan[pos]] = 0; | 1275 | 158k | } | 1276 | 10.5k | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) | 1277 | 0 | { | 1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; | 1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1283 | 0 | } | 1284 | | | 1285 | | /* rate-distortion based sign-hiding */ | 1286 | 10.5k | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | 1287 | 0 | { | 1288 | 0 | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; | 1289 | 0 | int lastCG = 1; | 1290 | |
| 1291 | 0 | for (int subSet = realLastScanPos; subSet >= 0; subSet--) | 1292 | 0 | { | 1293 | 0 | int subPos = subSet << LOG2_SCAN_SET_SIZE; | 1294 | 0 | int n; | 1295 | |
| 1296 | 0 | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) | 1297 | 0 | continue; | 1298 | | | 1299 | | /* measure distance between first and last non-zero coef in this | 1300 | | * coding group */ | 1301 | 0 | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]); | 1302 | 0 | const int firstNZPosInCG = (uint8_t)posFirstLast; | 1303 | 0 | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); | 1304 | 0 | const uint32_t absSumSign = posFirstLast; | 1305 | |
| 1306 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | 1307 | 0 | { | 1308 | 0 | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); | 1309 | |
| 1310 | | #if CHECKED_BUILD || _DEBUG | 1311 | | int32_t absSum_dummy = 0; | 1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | 1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; | 1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); | 1315 | | #endif | 1316 | | | 1317 | | //if (signbit != absSumSign) | 1318 | 0 | if (((int32_t)(signbit ^ absSumSign)) < 0) | 1319 | 0 | { | 1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | 1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | 1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | 1323 | |
| 1324 | 0 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | 1325 | 0 | uint32_t minPos = 0; | 1326 | 0 | int8_t finalChange = 0; | 1327 | 0 | int curChange = 0; | 1328 | 0 | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; | 1329 | |
| 1330 | 0 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | 1331 | 0 | { | 1332 | 0 | const uint32_t blkPos = codeParams.scan[n + subPos]; | 1333 | 0 | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 1334 | 0 | const int absLevel = abs(dstCoeff[blkPos]); | 1335 | | // TODO: this is constant in non-scaling mode | 1336 | 0 | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 1337 | 0 | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); | 1338 | |
| 1339 | 0 | int d = abs(signCoef) - (unQuantLevel >> unquantShift); | 1340 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); | 1341 | |
| 1342 | 0 | const int64_t origDist = (((int64_t)d * d)); | 1343 | |
| 1344 | 0 | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) | 1345 | |
| 1346 | 0 | const uint32_t isOne = (absLevel == 1); | 1347 | 0 | if (dstCoeff[blkPos]) | 1348 | 0 | { | 1349 | 0 | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); | 1350 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1351 | 0 | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); | 1352 | | | 1353 | | /* if decrementing would make the coeff 0, we can include the | 1354 | | * significant coeff flag cost savings */ | 1355 | 0 | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); | 1356 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1357 | 0 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); | 1358 | 0 | int64_t costDown = DELTARDCOST(origDist, d, downBits); | 1359 | |
| 1360 | 0 | costDown -= lastCoeffAdjust; | 1361 | 0 | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; | 1362 | |
| 1363 | 0 | curChange = 2 * (costUp < costDown) - 1; | 1364 | 0 | curCost = (costUp < costDown) ? costUp : curCost; | 1365 | 0 | } | 1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) | 1367 | 0 | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) | 1368 | 0 | { | 1369 | | /* don't try to make a new coded coeff before the first coeff if its | 1370 | | * sign would be different than the first coeff, the inferred sign would | 1371 | | * still be wrong and we'd have to do this again. */ | 1372 | 0 | curCost = MAX_INT64; | 1373 | 0 | } | 1374 | 0 | else | 1375 | 0 | { | 1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | 1377 | 0 | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); | 1378 | 0 | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); | 1379 | 0 | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | 1380 | 0 | curChange = 1; | 1381 | 0 | } | 1382 | |
| 1383 | 0 | if (curCost < minCostInc) | 1384 | 0 | { | 1385 | 0 | minCostInc = curCost; | 1386 | 0 | finalChange = (int8_t)curChange; | 1387 | 0 | minPos = blkPos + (absLevel << 16); | 1388 | 0 | } | 1389 | 0 | lastCoeffAdjust = 0; | 1390 | 0 | } | 1391 | |
| 1392 | 0 | const int absInMinPos = (minPos >> 16); | 1393 | 0 | minPos = (uint16_t)minPos; | 1394 | | | 1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | 1396 | 0 | if (absInMinPos >= 32767) | 1397 | | /* don't allow sign hiding to violate the SPEC range */ | 1398 | 0 | finalChange = -1; | 1399 | | | 1400 | | // NOTE: Reference code | 1401 | | //if (dstCoeff[minPos] == 0) | 1402 | | // numSig++; | 1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | 1404 | | // numSig--; | 1405 | 0 | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); | 1406 | | | 1407 | | | 1408 | | // NOTE: Reference code | 1409 | | //if (m_resiDctCoeff[minPos] >= 0) | 1410 | | // dstCoeff[minPos] += finalChange; | 1411 | | //else | 1412 | | // dstCoeff[minPos] -= finalChange; | 1413 | 0 | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); | 1414 | 0 | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); | 1415 | 0 | } | 1416 | 0 | } | 1417 | |
| 1418 | 0 | lastCG = 0; | 1419 | 0 | } | 1420 | 0 | } | 1421 | | | 1422 | 10.5k | return numSig; | 1423 | 280k | } |
unsigned int x265::Quant::rdoQuant<5u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Line | Count | Source | 611 | 25.2k | { | 612 | 25.2k | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | 613 | 25.2k | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | 614 | 25.2k | const uint32_t usePsyMask = usePsy ? -1 : 0; | 615 | | | 616 | 25.2k | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | 617 | | | 618 | 25.2k | int rem = m_qpParam[ttype].rem; | 619 | 25.2k | int per = m_qpParam[ttype].per; | 620 | 25.2k | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | 621 | 25.2k | int add = (1 << (qbits - 1)); | 622 | 25.2k | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | 623 | | | 624 | 25.2k | const int numCoeff = 1 << (log2TrSize * 2); | 625 | 25.2k | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | 626 | 25.2k | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); | 627 | 25.2k | if (!numSig) | 628 | 22.5k | return 0; | 629 | 2.61k | const uint32_t trSize = 1 << log2TrSize; | 630 | 2.61k | int64_t lambda2 = m_qpParam[ttype].lambda2; | 631 | 2.61k | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); | 632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | 633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | 634 | | * at several stages. 
We skip the clipping for simplicity when measuring RD cost */ | 635 | 2.61k | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | 636 | 2.61k | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); | 637 | 2.61k | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | 638 | 2.61k | const int scaleBits = SCALE_BITS - 2 * transformShift; | 639 | | | 640 | 2.61k | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | 641 | 2.61k | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | 642 | 2.61k | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | 643 | 2.61k | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) | 644 | | | 645 | 2.61k | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ | 646 | 2.61k | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ | 647 | 2.61k | int64_t costSig[trSize * trSize]; /* lambda * bits */ | 648 | | | 649 | 2.61k | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ | 650 | 2.61k | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ | 651 | 2.61k | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ | 652 | | | 653 | 2.61k | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | 654 | 2.61k | uint64_t sigCoeffGroupFlag64 = 0; | 655 | | | 656 | 2.61k | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | 657 | 2.61k | bool bIsLuma = ttype == TEXT_LUMA; | 658 | | | 659 | | /* total rate distortion cost of transform block, as CBF=0 */ | 660 | 2.61k | int64_t totalUncodedCost = 0; | 661 | | | 662 | | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, | 663 | | * the distortion and signal cost of coded blocks, and the coding cost of 
significant | 664 | | * coefficient and coefficient group bitmaps */ | 665 | 2.61k | int64_t totalRdCost = 0; | 666 | | | 667 | 2.61k | TUEntropyCodingParameters codeParams; | 668 | 2.61k | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | 669 | 2.61k | const uint32_t log2TrSizeCG = log2TrSize - 2; | 670 | 2.61k | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); | 671 | 2.61k | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); | 672 | | | 673 | 2.61k | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] | 674 | 2.61k | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign | 675 | 2.61k | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff | 676 | | | 677 | | #if CHECKED_BUILD || _DEBUG | 678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group | 679 | | memset(coeffNum, 0, sizeof(coeffNum)); | 680 | | memset(coeffSign, 0, sizeof(coeffNum)); | 681 | | memset(coeffFlag, 0, sizeof(coeffNum)); | 682 | | #endif | 683 | 2.61k | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); | 684 | 2.61k | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); | 685 | | | 686 | | | 687 | | /* TODO: update bit estimates if dirty */ | 688 | 2.61k | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | 689 | | | 690 | 2.61k | uint32_t scanPos = 0; | 691 | 2.61k | uint32_t c1 = 1; | 692 | | | 693 | | // process trail all zero Coeff Group | 694 | | | 695 | | /* coefficients after lastNZ have no distortion signal cost */ | 696 | 2.61k | const int zeroCG = cgNum - 1 - cgLastScanPos; | 697 | 2.61k | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 698 | 2.61k | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); | 699 | | | 700 | | /* sum 
zero coeff (uncodec) cost */ | 701 | | | 702 | | // TODO: does we need these cost? | 703 | 2.61k | if (usePsyMask) | 704 | 2.61k | { | 705 | 167k | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 706 | 164k | { | 707 | 164k | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 708 | 164k | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 709 | 164k | uint32_t blkPos = codeParams.scan[scanPosBase]; | 710 | 164k | #if X265_ARCH_X86 | 711 | 164k | bool enable512 = detect512(); | 712 | 164k | if (enable512) | 713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 714 | 164k | else | 715 | 164k | { | 716 | 164k | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); | 717 | 164k | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 718 | 164k | } | 719 | | #else | 720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 722 | | #endif | 723 | 164k | } | 724 | 2.61k | } | 725 | 1 | else | 726 | 1 | { | 727 | | // non-psy path | 728 | 1 | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) | 729 | 0 | { | 730 | 0 | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); | 731 | 0 | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 732 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 733 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 734 | 0 | } | 735 | 1 | } | 736 | 2.61k | static const uint8_t table_cnt[5][SCAN_SET_SIZE] = | 737 | 
2.61k | { | 738 | | // patternSigCtx = 0 | 739 | 2.61k | { | 740 | 2.61k | 2, 1, 1, 0, | 741 | 2.61k | 1, 1, 0, 0, | 742 | 2.61k | 1, 0, 0, 0, | 743 | 2.61k | 0, 0, 0, 0, | 744 | 2.61k | }, | 745 | | // patternSigCtx = 1 | 746 | 2.61k | { | 747 | 2.61k | 2, 2, 2, 2, | 748 | 2.61k | 1, 1, 1, 1, | 749 | 2.61k | 0, 0, 0, 0, | 750 | 2.61k | 0, 0, 0, 0, | 751 | 2.61k | }, | 752 | | // patternSigCtx = 2 | 753 | 2.61k | { | 754 | 2.61k | 2, 1, 0, 0, | 755 | 2.61k | 2, 1, 0, 0, | 756 | 2.61k | 2, 1, 0, 0, | 757 | 2.61k | 2, 1, 0, 0, | 758 | 2.61k | }, | 759 | | // patternSigCtx = 3 | 760 | 2.61k | { | 761 | 2.61k | 2, 2, 2, 2, | 762 | 2.61k | 2, 2, 2, 2, | 763 | 2.61k | 2, 2, 2, 2, | 764 | 2.61k | 2, 2, 2, 2, | 765 | 2.61k | }, | 766 | | // 4x4 | 767 | 2.61k | { | 768 | 2.61k | 0, 1, 4, 5, | 769 | 2.61k | 2, 3, 4, 5, | 770 | 2.61k | 6, 6, 8, 8, | 771 | 2.61k | 7, 7, 8, 8 | 772 | 2.61k | } | 773 | 2.61k | }; | 774 | | | 775 | | /* iterate over coding groups in reverse scan order */ | 776 | 5.22k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) | 777 | 2.61k | { | 778 | 2.61k | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; | 779 | 2.61k | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | 780 | 2.61k | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; | 781 | 2.61k | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); | 782 | 2.61k | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | 783 | 2.61k | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 784 | 2.61k | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); | 785 | | | 786 | 2.61k | if (c1 == 0) | 787 | 0 | ctxSet++; | 788 | 2.61k | c1 = 1; | 789 | | | 790 | 2.61k | if (cgScanPos && (coeffNum[cgScanPos] == 0)) | 791 | 0 | { | 792 | | // TODO: does we need zero-coeff cost? 
| 793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); | 794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; | 795 | 0 | if (usePsyMask) | 796 | 0 | { | 797 | 0 | #if X265_ARCH_X86 | 798 | 0 | bool enable512 = detect512(); | 799 | 0 | if (enable512) | 800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 801 | 0 | else | 802 | 0 | { | 803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 805 | 0 | } | 806 | | #else | 807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); | 809 | | #endif | 810 | 0 | blkPos = codeParams.scan[scanPosBase]; | 811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 812 | 0 | { | 813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 814 | 0 | { | 815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 819 | |
| 820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 823 | 0 | } | 824 | 0 | blkPos += trSize; | 825 | 0 | } | 826 | 0 | } | 827 | 0 | else | 828 | 0 | { | 829 | | // non-psy path | 830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); | 831 | 0 | blkPos = codeParams.scan[scanPosBase]; | 832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) | 833 | 0 | { | 834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) | 835 | 0 | { | 836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; | 837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; | 838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); | 839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 840 | |
| 841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; | 843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 844 | 0 | } | 845 | 0 | blkPos += trSize; | 846 | 0 | } | 847 | 0 | } | 848 | | | 849 | | /* there were no coded coefficients in this coefficient group */ | 850 | 0 | { | 851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 854 | 0 | } | 855 | 0 | continue; | 856 | 0 | } | 857 | | | 858 | 2.61k | coeffGroupRDStats cgRdStats; | 859 | 2.61k | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | 860 | | | 861 | 2.61k | uint32_t subFlagMask = coeffFlag[cgScanPos]; | 862 | 2.61k | int c2 = 0; | 863 | 2.61k | uint32_t goRiceParam = 0; | 864 | 2.61k | uint32_t levelThreshold = 3; | 865 | 2.61k | uint32_t c1Idx = 0; | 866 | 2.61k | uint32_t c2Idx = 0; | 867 | | /* iterate over coefficients in each group in reverse scan order */ | 868 | 44.3k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 869 | 41.7k | { | 870 | 41.7k | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | 871 | 41.7k | uint32_t blkPos = codeParams.scan[scanPos]; | 872 | 41.7k | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ | 873 | 41.7k | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 874 | 41.7k | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | 875 | | | 876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level | 877 | | * and the original DCT coefficient. 
The result is shifted scaleBits to account for the | 878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | 879 | | | 880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ | 881 | 41.7k | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; | 882 | 41.7k | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); | 883 | 41.7k | if (usePsyMask & scanPos) | 884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ | 885 | 39.1k | costUncoded[blkPos] -= PSYVALUE(predictedCoef); | 886 | | | 887 | 41.7k | totalUncodedCost += costUncoded[blkPos]; | 888 | | | 889 | | // coefficient level estimation | 890 | 41.7k | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; | 891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; | 892 | 41.7k | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; | 893 | 41.7k | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; | 894 | 41.7k | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; | 895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 
4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' | 896 | 41.7k | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); | 897 | | | 898 | | // before find lastest non-zero coeff | 899 | 41.7k | if (scanPos > (uint32_t)lastScanPos) | 900 | 39.1k | { | 901 | | /* coefficients after lastNZ have no distortion signal cost */ | 902 | 39.1k | costCoeff[scanPos] = 0; | 903 | 39.1k | costSig[scanPos] = 0; | 904 | | | 905 | | /* No non-zero coefficient yet found, but this does not mean | 906 | | * there is no uncoded-cost for this coefficient. Pre- | 907 | | * quantization the coefficient may have been non-zero */ | 908 | 39.1k | totalRdCost += costUncoded[blkPos]; | 909 | 39.1k | } | 910 | 2.61k | else if (!(subFlagMask & 1)) | 911 | 0 | { | 912 | | // fast zero coeff path | 913 | | /* set default costs to uncoded costs */ | 914 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 915 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 916 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 917 | 0 | totalRdCost += costCoeff[scanPos]; | 918 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; | 919 | |
| 920 | 0 | subFlagMask >>= 1; | 921 | 0 | } | 922 | 2.61k | else | 923 | 2.61k | { | 924 | 2.61k | subFlagMask >>= 1; | 925 | | | 926 | 2.61k | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | 927 | 2.61k | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} | 928 | | | 929 | 2.61k | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | 930 | 2.61k | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); | 931 | 2.61k | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | 932 | 2.61k | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); | 933 | | | 934 | | // coefficient level estimation | 935 | 2.61k | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; | 936 | 2.61k | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? 
levelAbsBits[1] : 0); | 937 | | | 938 | 2.61k | uint32_t level = 0; | 939 | 2.61k | uint32_t sigCoefBits = 0; | 940 | 2.61k | costCoeff[scanPos] = MAX_INT64; | 941 | | | 942 | 2.61k | if ((int)scanPos == lastScanPos) | 943 | 2.61k | sigRateDelta[blkPos] = 0; | 944 | 0 | else | 945 | 0 | { | 946 | 0 | if (maxAbsLevel < 3) | 947 | 0 | { | 948 | | /* set default costs to uncoded costs */ | 949 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); | 950 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; | 951 | 0 | } | 952 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; | 953 | 0 | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; | 954 | 0 | } | 955 | | | 956 | 2.61k | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); | 957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) | 958 | 2.61k | if (maxAbsLevel == 1) | 959 | 0 | { | 960 | 0 | uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; | 961 | 0 | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); | 962 | |
| 963 | 0 | int unquantAbsLevel = unQuantLevel >> unquantShift; | 964 | 0 | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); | 965 | 0 | int d = abs(signCoef) - unquantAbsLevel; | 966 | 0 | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | 967 | | | 968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 969 | 0 | if (usePsyMask & scanPos) | 970 | 0 | { | 971 | 0 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | 972 | 0 | curCost -= PSYVALUE(reconCoef); | 973 | 0 | } | 974 | |
| 975 | 0 | if (curCost < costCoeff[scanPos]) | 976 | 0 | { | 977 | 0 | level = 1; | 978 | 0 | costCoeff[scanPos] = curCost; | 979 | 0 | costSig[scanPos] = SIGCOST(sigCoefBits); | 980 | 0 | } | 981 | 0 | } | 982 | 2.61k | else if (maxAbsLevel) | 983 | 2.61k | { | 984 | 2.61k | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 985 | 2.61k | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; | 986 | | | 987 | 2.61k | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 988 | | | 989 | 2.61k | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; | 990 | 2.61k | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); | 991 | 2.61k | int d0 = abs(signCoef) - unquantAbsLevel0; | 992 | 2.61k | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); | 993 | | | 994 | 2.61k | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; | 995 | 2.61k | X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); | 996 | 2.61k | int d1 = abs(signCoef) - unquantAbsLevel1; | 997 | 2.61k | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); | 998 | | | 999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | 1000 | 2.61k | if (usePsyMask & scanPos) | 1001 | 0 | { | 1002 | 0 | int reconCoef; | 1003 | 0 | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); | 1004 | 0 | curCost0 -= PSYVALUE(reconCoef); | 1005 | |
| 1006 | 0 | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); | 1007 | 0 | curCost1 -= PSYVALUE(reconCoef); | 1008 | 0 | } | 1009 | 2.61k | if (curCost0 < costCoeff[scanPos]) | 1010 | 2.61k | { | 1011 | 2.61k | level = maxAbsLevel; | 1012 | 2.61k | costCoeff[scanPos] = curCost0; | 1013 | 2.61k | costSig[scanPos] = SIGCOST(sigCoefBits); | 1014 | 2.61k | } | 1015 | 2.61k | if (curCost1 < costCoeff[scanPos]) | 1016 | 70 | { | 1017 | 70 | level = maxAbsLevel - 1; | 1018 | 70 | costCoeff[scanPos] = curCost1; | 1019 | 70 | costSig[scanPos] = SIGCOST(sigCoefBits); | 1020 | 70 | } | 1021 | 2.61k | } | 1022 | | | 1023 | 2.61k | dstCoeff[blkPos] = (int16_t)level; | 1024 | 2.61k | totalRdCost += costCoeff[scanPos]; | 1025 | | | 1026 | | /* record costs for sign-hiding performed at the end */ | 1027 | 2.61k | if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level) | 1028 | 2.61k | { | 1029 | 2.61k | const int32_t diff0 = level - 1 - baseLevel; | 1030 | 2.61k | const int32_t diff2 = level + 1 - baseLevel; | 1031 | 2.61k | const int32_t maxVlc = g_goRiceRange[goRiceParam]; | 1032 | 2.61k | int rate0, rate1, rate2; | 1033 | | | 1034 | 2.61k | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% | 1035 | 0 | { | 1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} | 1037 | | // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 | 1038 | 0 | X265_CHECK(level == 1, "absLevel check failure\n"); | 1039 | |
| 1040 | 0 | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; | 1041 | 0 | const int rateNotEqual2 = greaterOneBits[0]; | 1042 | |
| 1043 | 0 | rate0 = 0; | 1044 | 0 | rate2 = rateEqual2; | 1045 | 0 | rate1 = rateNotEqual2; | 1046 | |
| 1047 | 0 | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1048 | 0 | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1049 | 0 | X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); | 1050 | 0 | } | 1051 | 2.61k | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% | 1052 | 0 | { | 1053 | | // NOTE: no c1c2 correct rate since all of rate include this factor | 1054 | 0 | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); | 1055 | 0 | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); | 1056 | 0 | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); | 1057 | 0 | } | 1058 | 2.61k | else | 1059 | 2.61k | { | 1060 | 2.61k | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1061 | 2.61k | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1062 | 2.61k | rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); | 1063 | 2.61k | } | 1064 | 2.61k | rateIncUp[blkPos] = rate2 - rate1; | 1065 | 2.61k | rateIncDown[blkPos] = rate0 - rate1; | 1066 | 2.61k | } | 1067 | 0 | else | 1068 | 0 | { | 1069 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; | 1070 | 0 | rateIncDown[blkPos] = 0; | 1071 | 0 | } | 1072 | | | 1073 | | /* Update CABAC estimation state */ | 1074 | 2.61k | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) | 1075 | 2.61k | { | 1076 | 2.61k | goRiceParam++; | 1077 | 2.61k | levelThreshold <<= 1; | 1078 | 2.61k | } | 1079 | | | 1080 | 2.61k | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; | 1081 | 2.61k | c1Idx += isNonZero; | 1082 | | | 1083 | | /* update bin model */ | 1084 | 2.61k | if (level > 1) | 1085 | 2.61k | { | 1086 | 2.61k | c1 = 0; | 
1087 | 2.61k | c2 += (uint32_t)(c2 - 2) >> 31; | 1088 | 2.61k | c2Idx++; | 1089 | 2.61k | } | 1090 | 0 | else if (((c1 == 1) | (c1 == 2)) & isNonZero) | 1091 | 0 | c1++; | 1092 | | | 1093 | 2.61k | if (dstCoeff[blkPos]) | 1094 | 2.61k | { | 1095 | 2.61k | sigCoeffGroupFlag64 |= cgBlkPosMask; | 1096 | 2.61k | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | 1097 | 2.61k | cgRdStats.uncodedDist += costUncoded[blkPos]; | 1098 | 2.61k | cgRdStats.nnzBeforePos0 += scanPosinCG; | 1099 | 2.61k | } | 1100 | 2.61k | } | 1101 | | | 1102 | 41.7k | cgRdStats.sigCost += costSig[scanPos]; | 1103 | 41.7k | } /* end for (scanPosinCG) */ | 1104 | | | 1105 | 2.61k | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); | 1106 | 2.61k | cgRdStats.sigCost0 = costSig[scanPos]; | 1107 | | | 1108 | 2.61k | costCoeffGroupSig[cgScanPos] = 0; | 1109 | | | 1110 | | /* nothing to do at this case */ | 1111 | 2.61k | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); | 1112 | | | 1113 | 2.61k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1114 | 2.61k | { | 1115 | | /* coeff group 0 is implied to be present, no signal cost */ | 1116 | | /* coeff group with last NZ is implied to be present, handled below */ | 1117 | 2.61k | } | 1118 | 0 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | 1119 | 0 | { | 1120 | 0 | if (!cgRdStats.nnzBeforePos0) | 1121 | 0 | { | 1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | 1123 | 0 | totalRdCost -= cgRdStats.sigCost0; | 1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; | 1125 | 0 | } | 1126 | | | 1127 | | /* there are coded coefficients in this group, but now we include the signaling cost | 1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the | 1129 | | * coded group is more than the RD cost of the uncoded group */ | 1130 | |
| 1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1132 | |
| 1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | 1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | 1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | 1137 | |
| 1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | 1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | 1140 | |
| 1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) | 1142 | 0 | { | 1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | 1144 | 0 | totalRdCost = costZeroCG; | 1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | 1146 | | | 1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | 1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; | 1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1153 | 0 | } | 1154 | 0 | } | 1155 | 0 | else | 1156 | 0 | { | 1157 | | /* there were no coded coefficients in this coefficient group */ | 1158 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); | 1159 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | 1160 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | 1161 | 0 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | 1162 | 0 | } | 1163 | 2.61k | } /* end for (cgScanPos) */ | 1164 | | | 1165 | 2.61k | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | 1166 | | | 1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | 1168 | 2.61k | int64_t bestCost; | 1169 | 2.61k | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | 1170 | 0 | { | 1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | 1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | 1173 | 0 | } | 1174 | 2.61k | else | 1175 | 2.61k | { | 1176 | 2.61k | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | 1177 | 2.61k | bestCost = 
totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | 1178 | 2.61k | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | 1179 | 2.61k | } | 1180 | | | 1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last | 1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | 1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | 1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | 1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | 1186 | 2.61k | int bestLastIdx = 0; | 1187 | 2.61k | bool foundLast = false; | 1188 | 5.22k | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | 1189 | 2.61k | { | 1190 | 2.61k | if (!cgScanPos || cgScanPos == cgLastScanPos) | 1191 | 2.61k | { | 1192 | | /* the presence of these coefficient groups are inferred, they have no bit in | 1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | 1194 | 2.61k | } | 1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | 1196 | 0 | { | 1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred | 1198 | | * from lastNZ if it were present in this group */ | 1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1200 | 0 | } | 1201 | 0 | else | 1202 | 0 | { | 1203 | | /* remove cost of signaling this empty group as not present */ | 1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | 1205 | 0 | continue; | 1206 | 0 | } | 1207 | | | 1208 | 41.7k | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | 1209 | 41.7k | { | 1210 | 41.7k | scanPos = cgScanPos * cgSize + scanPosinCG; | 1211 | 41.7k | if ((int)scanPos > lastScanPos) | 1212 | 39.1k | continue; | 1213 | | | 1214 | | /* if the coefficient was coded, measure the RD cost of 
it as the last non-zero and then | 1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | 1216 | | * cost of signaling it as not-significant */ | 1217 | 2.61k | uint32_t blkPos = codeParams.scan[scanPos]; | 1218 | 2.61k | if (dstCoeff[blkPos]) | 1219 | 2.61k | { | 1220 | | // Calculates the cost of signaling the last significant coefficient in the block | 1221 | 2.61k | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; | 1222 | 2.61k | if (codeParams.scanType == SCAN_VER) | 1223 | 0 | std::swap(pos[0], pos[1]); | 1224 | 2.61k | uint32_t bitsLastNZ = 0; | 1225 | | | 1226 | 7.83k | for (int i = 0; i < 2; i++) | 1227 | 5.22k | { | 1228 | 5.22k | int temp = g_lastCoeffTable[pos[i]]; | 1229 | 5.22k | int prefixOnes = temp & 15; | 1230 | 5.22k | int suffixLen = temp >> 4; | 1231 | | | 1232 | 5.22k | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; | 1233 | 5.22k | bitsLastNZ += IEP_RATE * suffixLen; | 1234 | 5.22k | } | 1235 | | | 1236 | 2.61k | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | 1237 | | | 1238 | 2.61k | if (costAsLast < bestCost) | 1239 | 2.61k | { | 1240 | 2.61k | bestLastIdx = scanPos + 1; | 1241 | 2.61k | bestCost = costAsLast; | 1242 | 2.61k | } | 1243 | 2.61k | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) | 1244 | 2.61k | { | 1245 | 2.61k | foundLast = true; | 1246 | 2.61k | break; | 1247 | 2.61k | } | 1248 | | | 1249 | 0 | totalRdCost -= costCoeff[scanPos]; | 1250 | 0 | totalRdCost += costUncoded[blkPos]; | 1251 | 0 | } | 1252 | 0 | else | 1253 | 0 | totalRdCost -= costSig[scanPos]; | 1254 | 2.61k | } | 1255 | 2.61k | } | 1256 | | | 1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ | 1258 | 2.61k | numSig = 0; | 1259 | 5.22k | for (int pos = 0; pos < bestLastIdx; pos++) | 1260 | 2.61k | { | 1261 | 2.61k | int blkPos = codeParams.scan[pos]; | 1262 | 2.61k | int level = dstCoeff[blkPos]; | 1263 | 2.61k | numSig += 
(level != 0); | 1264 | | | 1265 | 2.61k | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | 1266 | 2.61k | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | 1267 | 2.61k | } | 1268 | | | 1269 | | // Average 49.62 pixels | 1270 | | /* clean uncoded coefficients */ | 1271 | 2.61k | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); | 1272 | 41.7k | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) | 1273 | 39.1k | { | 1274 | 39.1k | dstCoeff[codeParams.scan[pos]] = 0; | 1275 | 39.1k | } | 1276 | 2.61k | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) | 1277 | 0 | { | 1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; | 1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); | 1283 | 0 | } | 1284 | | | 1285 | | /* rate-distortion based sign-hiding */ | 1286 | 2.61k | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | 1287 | 0 | { | 1288 | 0 | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; | 1289 | 0 | int lastCG = 1; | 1290 | |
| 1291 | 0 | for (int subSet = realLastScanPos; subSet >= 0; subSet--) | 1292 | 0 | { | 1293 | 0 | int subPos = subSet << LOG2_SCAN_SET_SIZE; | 1294 | 0 | int n; | 1295 | |
| 1296 | 0 | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) | 1297 | 0 | continue; | 1298 | | | 1299 | | /* measure distance between first and last non-zero coef in this | 1300 | | * coding group */ | 1301 | 0 | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]); | 1302 | 0 | const int firstNZPosInCG = (uint8_t)posFirstLast; | 1303 | 0 | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); | 1304 | 0 | const uint32_t absSumSign = posFirstLast; | 1305 | |
| 1306 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | 1307 | 0 | { | 1308 | 0 | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); | 1309 | |
| 1310 | | #if CHECKED_BUILD || _DEBUG | 1311 | | int32_t absSum_dummy = 0; | 1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | 1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; | 1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); | 1315 | | #endif | 1316 | | | 1317 | | //if (signbit != absSumSign) | 1318 | 0 | if (((int32_t)(signbit ^ absSumSign)) < 0) | 1319 | 0 | { | 1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | 1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | 1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | 1323 | |
| 1324 | 0 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | 1325 | 0 | uint32_t minPos = 0; | 1326 | 0 | int8_t finalChange = 0; | 1327 | 0 | int curChange = 0; | 1328 | 0 | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; | 1329 | |
| 1330 | 0 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | 1331 | 0 | { | 1332 | 0 | const uint32_t blkPos = codeParams.scan[n + subPos]; | 1333 | 0 | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | 1334 | 0 | const int absLevel = abs(dstCoeff[blkPos]); | 1335 | | // TODO: this is constant in non-scaling mode | 1336 | 0 | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); | 1337 | 0 | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); | 1338 | |
| 1339 | 0 | int d = abs(signCoef) - (unQuantLevel >> unquantShift); | 1340 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); | 1341 | |
| 1342 | 0 | const int64_t origDist = (((int64_t)d * d)); | 1343 | |
| 1344 | 0 | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) | 1345 | |
| 1346 | 0 | const uint32_t isOne = (absLevel == 1); | 1347 | 0 | if (dstCoeff[blkPos]) | 1348 | 0 | { | 1349 | 0 | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); | 1350 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1351 | 0 | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); | 1352 | | | 1353 | | /* if decrementing would make the coeff 0, we can include the | 1354 | | * significant coeff flag cost savings */ | 1355 | 0 | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); | 1356 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); | 1357 | 0 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); | 1358 | 0 | int64_t costDown = DELTARDCOST(origDist, d, downBits); | 1359 | |
| 1360 | 0 | costDown -= lastCoeffAdjust; | 1361 | 0 | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; | 1362 | |
| 1363 | 0 | curChange = 2 * (costUp < costDown) - 1; | 1364 | 0 | curCost = (costUp < costDown) ? costUp : curCost; | 1365 | 0 | } | 1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) | 1367 | 0 | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) | 1368 | 0 | { | 1369 | | /* don't try to make a new coded coeff before the first coeff if its | 1370 | | * sign would be different than the first coeff, the inferred sign would | 1371 | | * still be wrong and we'd have to do this again. */ | 1372 | 0 | curCost = MAX_INT64; | 1373 | 0 | } | 1374 | 0 | else | 1375 | 0 | { | 1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | 1377 | 0 | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); | 1378 | 0 | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); | 1379 | 0 | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | 1380 | 0 | curChange = 1; | 1381 | 0 | } | 1382 | |
| 1383 | 0 | if (curCost < minCostInc) | 1384 | 0 | { | 1385 | 0 | minCostInc = curCost; | 1386 | 0 | finalChange = (int8_t)curChange; | 1387 | 0 | minPos = blkPos + (absLevel << 16); | 1388 | 0 | } | 1389 | 0 | lastCoeffAdjust = 0; | 1390 | 0 | } | 1391 | |
| 1392 | 0 | const int absInMinPos = (minPos >> 16); | 1393 | 0 | minPos = (uint16_t)minPos; | 1394 | | | 1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | 1396 | 0 | if (absInMinPos >= 32767) | 1397 | | /* don't allow sign hiding to violate the SPEC range */ | 1398 | 0 | finalChange = -1; | 1399 | | | 1400 | | // NOTE: Reference code | 1401 | | //if (dstCoeff[minPos] == 0) | 1402 | | // numSig++; | 1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | 1404 | | // numSig--; | 1405 | 0 | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); | 1406 | | | 1407 | | | 1408 | | // NOTE: Reference code | 1409 | | //if (m_resiDctCoeff[minPos] >= 0) | 1410 | | // dstCoeff[minPos] += finalChange; | 1411 | | //else | 1412 | | // dstCoeff[minPos] -= finalChange; | 1413 | 0 | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); | 1414 | 0 | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); | 1415 | 0 | } | 1416 | 0 | } | 1417 | |
| 1418 | 0 | lastCG = 0; | 1419 | 0 | } | 1420 | 0 | } | 1421 | | | 1422 | 2.61k | return numSig; | 1423 | 25.2k | } |
|
1424 | | |
1425 | | /* Context derivation process of coeff_abs_significant_flag */ |
1426 | | uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, |
1427 | | uint32_t firstSignificanceMapContext) |
1428 | 0 | { |
1429 | 0 | static const uint8_t ctxIndMap[16] = |
1430 | 0 | { |
1431 | 0 | 0, 1, 4, 5, |
1432 | 0 | 2, 3, 4, 5, |
1433 | 0 | 6, 6, 8, 8, |
1434 | 0 | 7, 7, 8, 8 |
1435 | 0 | }; |
1436 | |
|
1437 | 0 | if (!blkPos) // special case for the DC context variable |
1438 | 0 | return 0; |
1439 | | |
1440 | 0 | if (log2TrSize == 2) // 4x4 |
1441 | 0 | return ctxIndMap[blkPos]; |
1442 | | |
1443 | 0 | const uint32_t posY = blkPos >> log2TrSize; |
1444 | 0 | const uint32_t posX = blkPos & (trSize - 1); |
1445 | 0 | X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n"); |
1446 | |
|
1447 | 0 | int posXinSubset = blkPos & 3; |
1448 | 0 | X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n"); |
1449 | 0 | int posYinSubset = posY & 3; |
1450 | | |
1451 | | // NOTE: [patternSigCtx][posXinSubset][posYinSubset] |
1452 | 0 | static const uint8_t table_cnt[4][4][4] = |
1453 | 0 | { |
1454 | | // patternSigCtx = 0 |
1455 | 0 | { |
1456 | 0 | { 2, 1, 1, 0 }, |
1457 | 0 | { 1, 1, 0, 0 }, |
1458 | 0 | { 1, 0, 0, 0 }, |
1459 | 0 | { 0, 0, 0, 0 }, |
1460 | 0 | }, |
1461 | | // patternSigCtx = 1 |
1462 | 0 | { |
1463 | 0 | { 2, 1, 0, 0 }, |
1464 | 0 | { 2, 1, 0, 0 }, |
1465 | 0 | { 2, 1, 0, 0 }, |
1466 | 0 | { 2, 1, 0, 0 }, |
1467 | 0 | }, |
1468 | | // patternSigCtx = 2 |
1469 | 0 | { |
1470 | 0 | { 2, 2, 2, 2 }, |
1471 | 0 | { 1, 1, 1, 1 }, |
1472 | 0 | { 0, 0, 0, 0 }, |
1473 | 0 | { 0, 0, 0, 0 }, |
1474 | 0 | }, |
1475 | | // patternSigCtx = 3 |
1476 | 0 | { |
1477 | 0 | { 2, 2, 2, 2 }, |
1478 | 0 | { 2, 2, 2, 2 }, |
1479 | 0 | { 2, 2, 2, 2 }, |
1480 | 0 | { 2, 2, 2, 2 }, |
1481 | 0 | } |
1482 | 0 | }; |
1483 | |
|
1484 | 0 | int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset]; |
1485 | 0 | int offset = firstSignificanceMapContext; |
1486 | |
|
1487 | 0 | offset += cnt; |
1488 | |
|
1489 | 0 | return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset; |
1490 | 0 | } |
1491 | | |