/src/x265/source/common/quant.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * Min Chen <chenm003@163.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #include "common.h" |
26 | | #include "primitives.h" |
27 | | #include "quant.h" |
28 | | #include "framedata.h" |
29 | | #include "entropy.h" |
30 | | #include "yuv.h" |
31 | | #include "cudata.h" |
32 | | #include "contexts.h" |
33 | | |
34 | | using namespace X265_NS; |
35 | | |
/* Applies the sign of y to x without branching: yields x when y is
 * non-negative and -x when y is negative.  (y >> 31) is used as a mask that is
 * all ones for a negative y and zero otherwise; this assumes a 32-bit y and an
 * arithmetic right shift.  Both arguments are fully parenthesized so compound
 * expressions expand with the intended precedence.  y is evaluated twice, so
 * it must not have side effects. */
#define SIGN(x, y) (((x) ^ ((y) >> 31)) - ((y) >> 31))
37 | | |
38 | | namespace { |
39 | | |
/* Rate-distortion bookkeeping gathered for a single coefficient group */
struct coeffGroupRDStats
{
    /* indicates coeff other than pos 0 are coded */
    int     nnzBeforePos0;

    /* distortion and level cost of coded coefficients */
    int64_t codedLevelAndDist;

    /* uncoded distortion cost of coded coefficients */
    int64_t uncodedDist;

    /* cost of signaling significant coeff bitmap */
    int64_t sigCost;

    /* cost of signaling sig coeff bit of coeff 0 */
    int64_t sigCost0;
};
48 | | |
/* Returns the smaller of x and y (y when they are equal).
 *
 * A plain comparison is used instead of the (x - y) sign-mask trick: that
 * subtraction overflows, which is undefined behaviour for signed int, when
 * the operands are far apart, and the trick additionally relies on an
 * arithmetic right shift of a negative value. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y;
}
53 | | |
54 | | inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate) |
55 | 0 | { |
56 | 0 | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); |
57 | 0 | if (!absLevel) |
58 | 0 | { |
59 | 0 | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); |
60 | 0 | return 0; |
61 | 0 | } |
62 | 0 | int rate = 0; |
63 | |
|
64 | 0 | if (diffLevel < 0) |
65 | 0 | { |
66 | 0 | X265_CHECK(absLevel <= 2, "absLevel check failure\n"); |
67 | 0 | rate += greaterOneBits[(absLevel == 2)]; |
68 | |
|
69 | 0 | if (absLevel == 2) |
70 | 0 | rate += levelAbsBits[0]; |
71 | 0 | } |
72 | 0 | else |
73 | 0 | { |
74 | 0 | uint32_t symbol = diffLevel; |
75 | 0 | bool expGolomb = (symbol > maxVlc); |
76 | |
|
77 | 0 | if (expGolomb) |
78 | 0 | { |
79 | 0 | absLevel = symbol - maxVlc; |
80 | | |
81 | | // NOTE: mapping to x86 hardware instruction BSR |
82 | 0 | unsigned long size; |
83 | 0 | CLZ(size, absLevel); |
84 | 0 | int egs = size * 2 + 1; |
85 | |
|
86 | 0 | rate += egs << 15; |
87 | | |
88 | | // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1) |
89 | 0 | X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n"); |
90 | 0 | symbol = maxVlc + 1; |
91 | 0 | } |
92 | |
|
93 | 0 | uint32_t prefLen = (symbol >> absGoRice) + 1; |
94 | 0 | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); |
95 | |
|
96 | 0 | rate += numBins << 15; |
97 | 0 | rate += c1c2Rate; |
98 | 0 | } |
99 | 0 | return rate; |
100 | 0 | } |
101 | | |
#if CHECKED_BUILD || _DEBUG
/* Checked/debug-build helper: rate of a level in [0, 2] that is described by
 * the greater-than-one / greater-than-two flags alone. */
inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits)
{
    X265_CHECK(absLevel <= 2, "absLevel check failure\n");

    switch (absLevel)
    {
    case 0:
        return 0;
    case 2:
        return greaterOneBits[1] + levelAbsBits[0];
    default:
        return greaterOneBits[0];
    }
}
#endif
117 | | |
118 | | inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice) |
119 | 0 | { |
120 | 0 | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); |
121 | 0 | if (!absLevel) |
122 | 0 | { |
123 | 0 | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); |
124 | 0 | return 0; |
125 | 0 | } |
126 | 0 | int rate; |
127 | |
|
128 | 0 | uint32_t symbol = diffLevel; |
129 | 0 | uint32_t prefLen = (symbol >> absGoRice) + 1; |
130 | 0 | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); |
131 | |
|
132 | 0 | rate = numBins << 15; |
133 | |
|
134 | 0 | return rate; |
135 | 0 | } |
136 | | |
137 | | /* Calculates the cost for specific absolute transform level */ |
138 | | inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate) |
139 | 0 | { |
140 | 0 | X265_CHECK(absLevel, "absLevel should not be zero\n"); |
141 | |
|
142 | 0 | if (diffLevel < 0) |
143 | 0 | { |
144 | 0 | X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n"); |
145 | |
|
146 | 0 | uint32_t rate = greaterOneBits[(absLevel == 2)]; |
147 | 0 | if (absLevel == 2) |
148 | 0 | rate += levelAbsBits[0]; |
149 | 0 | return rate; |
150 | 0 | } |
151 | 0 | else |
152 | 0 | { |
153 | 0 | uint32_t rate; |
154 | 0 | uint32_t symbol = diffLevel; |
155 | 0 | if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) |
156 | 0 | { |
157 | 0 | uint32_t length = symbol >> absGoRice; |
158 | 0 | rate = (length + 1 + absGoRice) << 15; |
159 | 0 | } |
160 | 0 | else |
161 | 0 | { |
162 | 0 | uint32_t length = 0; |
163 | 0 | symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; |
164 | 0 | if (symbol) |
165 | 0 | { |
166 | 0 | unsigned long idx; |
167 | 0 | CLZ(idx, symbol + 1); |
168 | 0 | length = idx; |
169 | 0 | } |
170 | |
|
171 | 0 | rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15; |
172 | 0 | } |
173 | 0 | rate += c1c2Rate; |
174 | 0 | return rate; |
175 | 0 | } |
176 | 0 | } |
177 | | |
178 | | } |
179 | | |
180 | | Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>}; |
181 | | |
182 | | Quant::Quant() |
183 | 0 | { |
184 | 0 | m_resiDctCoeff = NULL; |
185 | 0 | m_fencDctCoeff = NULL; |
186 | 0 | m_fencShortBuf = NULL; |
187 | 0 | m_frameNr = NULL; |
188 | 0 | m_nr = NULL; |
189 | 0 | } |
190 | | |
191 | | bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy) |
192 | 0 | { |
193 | 0 | m_entropyCoder = &entropy; |
194 | 0 | m_psyRdoqScale = (int32_t)(psyScale * 256.0); |
195 | 0 | X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n"); |
196 | 0 | m_scalingList = &scalingList; |
197 | 0 | m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); |
198 | 0 | m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); |
199 | 0 | m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); |
200 | |
|
201 | 0 | return m_resiDctCoeff && m_fencShortBuf; |
202 | 0 | } |
203 | | |
204 | | bool Quant::allocNoiseReduction(const x265_param& param) |
205 | 0 | { |
206 | 0 | m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads); |
207 | 0 | if (m_frameNr) |
208 | 0 | memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads); |
209 | 0 | else |
210 | 0 | return false; |
211 | 0 | return true; |
212 | 0 | } |
213 | | |
214 | | Quant::~Quant() |
215 | 0 | { |
216 | 0 | X265_FREE(m_frameNr); |
217 | 0 | X265_FREE(m_resiDctCoeff); |
218 | 0 | X265_FREE(m_fencShortBuf); |
219 | 0 | } |
220 | | |
221 | | void Quant::setQPforQuant(const CUData& ctu, int qp) |
222 | 0 | { |
223 | 0 | m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; |
224 | 0 | m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET); |
225 | 0 | m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel; |
226 | 0 | if (ctu.m_chromaFormat != X265_CSP_I400) |
227 | 0 | { |
228 | 0 | setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0] + ctu.m_slice->m_chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat); |
229 | 0 | setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1] + ctu.m_slice->m_chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat); |
230 | 0 | } |
231 | 0 | } |
232 | | |
233 | | void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) |
234 | 0 | { |
235 | 0 | int qp = x265_clip3(-QP_BD_OFFSET, 57, qpin); |
236 | 0 | if (qp >= 30) |
237 | 0 | { |
238 | 0 | if (chFmt == X265_CSP_I420) |
239 | 0 | qp = g_chromaScale[qp]; |
240 | 0 | else |
241 | 0 | qp = X265_MIN(qp, QP_MAX_SPEC); |
242 | 0 | } |
243 | 0 | m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET); |
244 | 0 | } |
245 | | |
246 | | /* To minimize the distortion only. No rate is considered */ |
247 | | uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams, uint32_t log2TrSize) |
248 | 0 | { |
249 | 0 | uint32_t trSize = 1 << log2TrSize; |
250 | 0 | const uint16_t* scan = codeParams.scan; |
251 | |
|
252 | 0 | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] |
253 | 0 | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign |
254 | 0 | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff |
255 | |
|
256 | | #if CHECKED_BUILD || _DEBUG |
257 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group |
258 | | memset(coeffNum, 0, sizeof(coeffNum)); |
259 | | memset(coeffSign, 0, sizeof(coeffNum)); |
260 | | memset(coeffFlag, 0, sizeof(coeffNum)); |
261 | | #endif |
262 | 0 | const int lastScanPos = primitives.scanPosLast(codeParams.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); |
263 | 0 | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); |
264 | 0 | unsigned long tmp; |
265 | | |
266 | | // first CG need specially processing |
267 | 0 | const uint32_t correctOffset = 0x0F & (lastScanPos ^ 0xF); |
268 | 0 | coeffFlag[cgLastScanPos] <<= correctOffset; |
269 | |
|
270 | 0 | for (int cg = cgLastScanPos; cg >= 0; cg--) |
271 | 0 | { |
272 | 0 | int cgStartPos = cg << LOG2_SCAN_SET_SIZE; |
273 | 0 | int n; |
274 | |
|
275 | | #if CHECKED_BUILD || _DEBUG |
276 | | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) |
277 | | if (coeff[scan[n + cgStartPos]]) |
278 | | break; |
279 | | int lastNZPosInCG0 = n; |
280 | | #endif |
281 | |
|
282 | 0 | if (coeffNum[cg] == 0) |
283 | 0 | { |
284 | 0 | X265_CHECK(lastNZPosInCG0 < 0, "all zero block check failure\n"); |
285 | 0 | continue; |
286 | 0 | } |
287 | | |
288 | | #if CHECKED_BUILD || _DEBUG |
289 | | for (n = 0;; n++) |
290 | | if (coeff[scan[n + cgStartPos]]) |
291 | | break; |
292 | | |
293 | | int firstNZPosInCG0 = n; |
294 | | #endif |
295 | | |
296 | 0 | CLZ(tmp, coeffFlag[cg]); |
297 | 0 | const int firstNZPosInCG = (15 ^ tmp); |
298 | |
|
299 | 0 | CTZ(tmp, coeffFlag[cg]); |
300 | 0 | const int lastNZPosInCG = (15 ^ tmp); |
301 | |
|
302 | 0 | X265_CHECK(firstNZPosInCG0 == firstNZPosInCG, "firstNZPosInCG0 check failure\n"); |
303 | 0 | X265_CHECK(lastNZPosInCG0 == lastNZPosInCG, "lastNZPosInCG0 check failure\n"); |
304 | |
|
305 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) |
306 | 0 | { |
307 | 0 | uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1; |
308 | 0 | uint32_t absSum = 0; |
309 | |
|
310 | 0 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) |
311 | 0 | absSum += coeff[scan[n + cgStartPos]]; |
312 | |
|
313 | 0 | if (signbit != (absSum & 0x1)) // compare signbit with sum_parity |
314 | 0 | { |
315 | 0 | int minCostInc = MAX_INT, minPos = -1, curCost = MAX_INT; |
316 | 0 | int32_t finalChange = 0, curChange = 0; |
317 | 0 | uint32_t cgFlags = coeffFlag[cg]; |
318 | 0 | if (cg == cgLastScanPos) |
319 | 0 | cgFlags >>= correctOffset; |
320 | |
|
321 | 0 | for (n = (cg == cgLastScanPos ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) |
322 | 0 | { |
323 | 0 | uint32_t blkPos = scan[n + cgStartPos]; |
324 | 0 | X265_CHECK(!!coeff[blkPos] == !!(cgFlags & 1), "non zero coeff check failure\n"); |
325 | |
|
326 | 0 | if (cgFlags & 1) |
327 | 0 | { |
328 | 0 | if (deltaU[blkPos] > 0) |
329 | 0 | { |
330 | 0 | curCost = -deltaU[blkPos]; |
331 | 0 | curChange = 1; |
332 | 0 | } |
333 | 0 | else |
334 | 0 | { |
335 | 0 | if ((cgFlags == 1) && (abs(coeff[blkPos]) == 1)) |
336 | 0 | { |
337 | 0 | X265_CHECK(n == firstNZPosInCG, "firstNZPosInCG position check failure\n"); |
338 | 0 | curCost = MAX_INT; |
339 | 0 | } |
340 | 0 | else |
341 | 0 | { |
342 | 0 | curCost = deltaU[blkPos]; |
343 | 0 | curChange = -1; |
344 | 0 | } |
345 | 0 | } |
346 | 0 | } |
347 | 0 | else |
348 | 0 | { |
349 | 0 | if (cgFlags == 0) |
350 | 0 | { |
351 | 0 | X265_CHECK(n < firstNZPosInCG, "firstNZPosInCG position check failure\n"); |
352 | 0 | uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1; |
353 | 0 | if (thisSignBit != signbit) |
354 | 0 | curCost = MAX_INT; |
355 | 0 | else |
356 | 0 | { |
357 | 0 | curCost = -deltaU[blkPos]; |
358 | 0 | curChange = 1; |
359 | 0 | } |
360 | 0 | } |
361 | 0 | else |
362 | 0 | { |
363 | 0 | curCost = -deltaU[blkPos]; |
364 | 0 | curChange = 1; |
365 | 0 | } |
366 | 0 | } |
367 | |
|
368 | 0 | if (curCost < minCostInc) |
369 | 0 | { |
370 | 0 | minCostInc = curCost; |
371 | 0 | finalChange = curChange; |
372 | 0 | minPos = blkPos; |
373 | 0 | } |
374 | 0 | cgFlags>>=1; |
375 | 0 | } |
376 | | |
377 | | /* do not allow change to violate coeff clamp */ |
378 | 0 | if (coeff[minPos] == 32767 || coeff[minPos] == -32768) |
379 | 0 | finalChange = -1; |
380 | |
|
381 | 0 | if (!coeff[minPos]) |
382 | 0 | numSig++; |
383 | 0 | else if (finalChange == -1 && abs(coeff[minPos]) == 1) |
384 | 0 | numSig--; |
385 | |
|
386 | 0 | { |
387 | 0 | const int16_t sigMask = ((int16_t)m_resiDctCoeff[minPos]) >> 15; |
388 | 0 | coeff[minPos] += ((int16_t)finalChange ^ sigMask) - sigMask; |
389 | 0 | } |
390 | 0 | } |
391 | 0 | } |
392 | 0 | } |
393 | |
|
394 | 0 | return numSig; |
395 | 0 | } |
396 | | |
397 | | uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, |
398 | | coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) |
399 | 0 | { |
400 | 0 | const uint32_t sizeIdx = log2TrSize - 2; |
401 | |
|
402 | 0 | if (cu.m_tqBypass[0]) |
403 | 0 | { |
404 | 0 | X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); |
405 | 0 | return primitives.cu[sizeIdx].copy_cnt(coeff, residual, resiStride); |
406 | 0 | } |
407 | | |
408 | 0 | bool isLuma = ttype == TEXT_LUMA; |
409 | 0 | bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; |
410 | 0 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform |
411 | |
|
412 | 0 | X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); |
413 | 0 | if (useTransformSkip) |
414 | 0 | { |
415 | 0 | #if X265_DEPTH <= 10 |
416 | 0 | X265_CHECK(transformShift >= 0, "invalid transformShift\n"); |
417 | 0 | primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift); |
418 | | #else |
419 | | if (transformShift >= 0) |
420 | | primitives.cu[sizeIdx].cpy2Dto1D_shl(m_resiDctCoeff, residual, resiStride, transformShift); |
421 | | else |
422 | | primitives.cu[sizeIdx].cpy2Dto1D_shr(m_resiDctCoeff, residual, resiStride, -transformShift); |
423 | | #endif |
424 | 0 | } |
425 | 0 | else |
426 | 0 | { |
427 | 0 | bool isIntra = cu.isIntra(absPartIdx); |
428 | |
|
429 | 0 | if (!sizeIdx && isLuma && isIntra) |
430 | 0 | primitives.dst4x4(residual, m_resiDctCoeff, resiStride); |
431 | 0 | else |
432 | 0 | primitives.cu[sizeIdx].dct(residual, m_resiDctCoeff, resiStride); |
433 | | |
434 | | /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so |
435 | | * there is no risk of performing this DCT unnecessarily */ |
436 | 0 | if (usePsy) |
437 | 0 | { |
438 | 0 | int trSize = 1 << log2TrSize; |
439 | | /* perform DCT on source pixels for psy-rdoq */ |
440 | 0 | primitives.cu[sizeIdx].copy_ps(m_fencShortBuf, trSize, fenc, fencStride); |
441 | 0 | primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize); |
442 | 0 | } |
443 | |
|
444 | 0 | if (m_nr && m_nr->offset) |
445 | 0 | { |
446 | | /* denoise is not applied to intra residual, so DST can be ignored */ |
447 | 0 | int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra; |
448 | 0 | int numCoeff = 1 << (log2TrSize * 2); |
449 | 0 | primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff); |
450 | 0 | m_nr->count[cat]++; |
451 | 0 | } |
452 | 0 | } |
453 | |
|
454 | 0 | if (m_rdoqLevel) |
455 | 0 | return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy); |
456 | 0 | else |
457 | 0 | { |
458 | 0 | int deltaU[32 * 32]; |
459 | |
|
460 | 0 | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; |
461 | 0 | int rem = m_qpParam[ttype].rem; |
462 | 0 | int per = m_qpParam[ttype].per; |
463 | 0 | const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
464 | |
|
465 | 0 | int qbits = QUANT_SHIFT + per + transformShift; |
466 | 0 | int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9); |
467 | 0 | int numCoeff = 1 << (log2TrSize * 2); |
468 | |
|
469 | 0 | uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff); |
470 | |
|
471 | 0 | if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled) |
472 | 0 | { |
473 | 0 | TUEntropyCodingParameters codeParams; |
474 | 0 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma); |
475 | 0 | return signBitHidingHDQ(coeff, deltaU, numSig, codeParams, log2TrSize); |
476 | 0 | } |
477 | 0 | else |
478 | 0 | return numSig; |
479 | 0 | } |
480 | 0 | } |
481 | | |
482 | | uint64_t Quant::ssimDistortion(const CUData& cu, const pixel* fenc, uint32_t fStride, const pixel* recon, intptr_t rstride, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx) |
483 | 0 | { |
484 | 0 | static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5); // 416 |
485 | 0 | static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5); // 235963 |
486 | 0 | int shift = (X265_DEPTH - 8); |
487 | |
|
488 | 0 | int trSize = 1 << log2TrSize; |
489 | 0 | uint64_t ssDc = 0, ssBlock = 0, ssAc = 0; |
490 | | |
491 | | // Calculation of (X(0) - Y(0)) * (X(0) - Y(0)), DC |
492 | 0 | ssDc = 0; |
493 | 0 | for (int y = 0; y < trSize; y += 4) |
494 | 0 | { |
495 | 0 | for (int x = 0; x < trSize; x += 4) |
496 | 0 | { |
497 | 0 | int temp = fenc[y * fStride + x] - recon[y * rstride + x]; // copy of residual coeff |
498 | 0 | ssDc += temp * temp; |
499 | 0 | } |
500 | 0 | } |
501 | | |
502 | | // Calculation of (X(k) - Y(k)) * (X(k) - Y(k)), AC |
503 | 0 | ssBlock = 0; |
504 | 0 | uint64_t ac_k = 0; |
505 | 0 | primitives.cu[log2TrSize - 2].ssimDist(fenc, fStride, recon, rstride, &ssBlock, shift, &ac_k); |
506 | 0 | ssAc = ssBlock - ssDc; |
507 | | |
508 | | // 1. Calculation of fdc' |
509 | | // Calculate numerator of dc normalization factor |
510 | 0 | uint64_t fDc_num = 0; |
511 | | |
512 | | // 2. Calculate dc component |
513 | 0 | uint64_t dc_k = 0; |
514 | 0 | for (int block_yy = 0; block_yy < trSize; block_yy += 4) |
515 | 0 | { |
516 | 0 | for (int block_xx = 0; block_xx < trSize; block_xx += 4) |
517 | 0 | { |
518 | 0 | uint32_t temp = fenc[block_yy * fStride + block_xx] >> shift; |
519 | 0 | dc_k += temp * temp; |
520 | 0 | } |
521 | 0 | } |
522 | |
|
523 | 0 | fDc_num = (2 * dc_k) + (trSize * trSize * ssim_c1); // 16 pixels -> for each 4x4 block |
524 | 0 | fDc_num /= ((trSize >> 2) * (trSize >> 2)); |
525 | | |
526 | | // 1. Calculation of fac' |
527 | | // Calculate numerator of ac normalization factor |
528 | 0 | uint64_t fAc_num = 0; |
529 | | |
530 | | // 2. Calculate ac component |
531 | 0 | ac_k -= dc_k; |
532 | |
|
533 | 0 | double s = 1 + 0.005 * cu.m_qp[absPartIdx]; |
534 | |
|
535 | 0 | fAc_num = ac_k + uint64_t(s * ac_k) + ssim_c2; |
536 | 0 | fAc_num /= ((trSize >> 2) * (trSize >> 2)); |
537 | | |
538 | | // Calculate dc and ac normalization factor |
539 | 0 | uint64_t ssim_distortion = ((ssDc * cu.m_fDc_den[ttype]) / fDc_num) + ((ssAc * cu.m_fAc_den[ttype]) / fAc_num); |
540 | 0 | return ssim_distortion; |
541 | 0 | } |
542 | | |
543 | | void Quant::invtransformNxN(const CUData& cu, int16_t* residual, uint32_t resiStride, const coeff_t* coeff, |
544 | | uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) |
545 | 0 | { |
546 | 0 | const uint32_t sizeIdx = log2TrSize - 2; |
547 | 0 | if (cu.m_tqBypass[0]) |
548 | 0 | { |
549 | 0 | primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0); |
550 | 0 | return; |
551 | 0 | } |
552 | | // Values need to pass as input parameter in dequant |
553 | 0 | int rem = m_qpParam[ttype].rem; |
554 | 0 | int per = m_qpParam[ttype].per; |
555 | 0 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; |
556 | 0 | int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; |
557 | 0 | int numCoeff = 1 << (log2TrSize * 2); |
558 | |
|
559 | 0 | if (m_scalingList->m_bEnabled) |
560 | 0 | { |
561 | 0 | int scalingListType = (bIntra ? 0 : 3) + ttype; |
562 | 0 | const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem]; |
563 | 0 | primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); |
564 | 0 | } |
565 | 0 | else |
566 | 0 | { |
567 | 0 | int scale = m_scalingList->s_invQuantScales[rem] << per; |
568 | 0 | primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift); |
569 | 0 | } |
570 | |
|
571 | 0 | if (useTransformSkip) |
572 | 0 | { |
573 | 0 | #if X265_DEPTH <= 10 |
574 | 0 | X265_CHECK(transformShift > 0, "invalid transformShift\n"); |
575 | 0 | primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift); |
576 | | #else |
577 | | if (transformShift > 0) |
578 | | primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift); |
579 | | else |
580 | | primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift); |
581 | | #endif |
582 | 0 | } |
583 | 0 | else |
584 | 0 | { |
585 | 0 | int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; |
586 | 0 | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n"); |
587 | | // DC only |
588 | 0 | if (numSig == 1 && coeff[0] != 0 && !useDST) |
589 | 0 | { |
590 | 0 | const int shift_1st = 7 - 6; |
591 | 0 | const int add_1st = 1 << (shift_1st - 1); |
592 | 0 | const int shift_2nd = 12 - (X265_DEPTH - 8) - 3; |
593 | 0 | const int add_2nd = 1 << (shift_2nd - 1); |
594 | |
|
595 | 0 | int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd; |
596 | 0 | primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val); |
597 | 0 | return; |
598 | 0 | } |
599 | | |
600 | 0 | if (useDST) |
601 | 0 | primitives.idst4x4(m_resiDctCoeff, residual, resiStride); |
602 | 0 | else |
603 | 0 | primitives.cu[sizeIdx].idct(m_resiDctCoeff, residual, resiStride); |
604 | 0 | } |
605 | 0 | } |
606 | | |
607 | | /* Rate distortion optimized quantization for entropy coding engines using |
608 | | * probability models like CABAC */ |
609 | | template<uint32_t log2TrSize> |
610 | | uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy) |
611 | 0 | { |
612 | 0 | const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ |
613 | 0 | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; |
614 | 0 | const uint32_t usePsyMask = usePsy ? -1 : 0; |
615 | |
|
616 | 0 | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); |
617 | |
|
618 | 0 | int rem = m_qpParam[ttype].rem; |
619 | 0 | int per = m_qpParam[ttype].per; |
620 | 0 | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ |
621 | 0 | int add = (1 << (qbits - 1)); |
622 | 0 | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
623 | |
|
624 | 0 | const int numCoeff = 1 << (log2TrSize * 2); |
625 | 0 | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); |
626 | 0 | X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); |
627 | 0 | if (!numSig) |
628 | 0 | return 0; |
629 | 0 | const uint32_t trSize = 1 << log2TrSize; |
630 | 0 | int64_t lambda2 = m_qpParam[ttype].lambda2; |
631 | 0 | int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda); |
632 | | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) |
633 | | * scale applied that must be removed during unquant. Note that in real dequant there is clipping |
634 | | * at several stages. We skip the clipping for simplicity when measuring RD cost */ |
635 | 0 | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; |
636 | 0 | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); |
637 | 0 | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; |
638 | 0 | const int scaleBits = SCALE_BITS - 2 * transformShift; |
639 | |
|
640 | 0 | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) |
641 | 0 | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) |
642 | 0 | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) |
643 | 0 | #define PSYVALUE(rec) ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1))) |
644 | |
|
645 | 0 | int64_t costCoeff[trSize * trSize]; /* d*d + lambda * bits */ |
646 | 0 | int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0 */ |
647 | 0 | int64_t costSig[trSize * trSize]; /* lambda * bits */ |
648 | |
|
649 | 0 | int rateIncUp[trSize * trSize]; /* signal overhead of increasing level */ |
650 | 0 | int rateIncDown[trSize * trSize]; /* signal overhead of decreasing level */ |
651 | 0 | int sigRateDelta[trSize * trSize]; /* signal difference between zero and non-zero */ |
652 | |
|
653 | 0 | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ |
654 | 0 | uint64_t sigCoeffGroupFlag64 = 0; |
655 | |
|
656 | 0 | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ |
657 | 0 | bool bIsLuma = ttype == TEXT_LUMA; |
658 | | |
659 | | /* total rate distortion cost of transform block, as CBF=0 */ |
660 | 0 | int64_t totalUncodedCost = 0; |
661 | | |
662 | | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, |
663 | | * the distortion and signal cost of coded blocks, and the coding cost of significant |
664 | | * coefficient and coefficient group bitmaps */ |
665 | 0 | int64_t totalRdCost = 0; |
666 | |
|
667 | 0 | TUEntropyCodingParameters codeParams; |
668 | 0 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); |
669 | 0 | const uint32_t log2TrSizeCG = log2TrSize - 2; |
670 | 0 | const uint32_t cgNum = 1 << (log2TrSizeCG * 2); |
671 | 0 | const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE); |
672 | |
|
673 | 0 | uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] |
674 | 0 | uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign |
675 | 0 | uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff |
676 | |
|
677 | | #if CHECKED_BUILD || _DEBUG |
678 | | // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group |
679 | | memset(coeffNum, 0, sizeof(coeffNum)); |
680 | | memset(coeffSign, 0, sizeof(coeffNum)); |
681 | | memset(coeffFlag, 0, sizeof(coeffNum)); |
682 | | #endif |
683 | 0 | const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize); |
684 | 0 | const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE); |
685 | | |
686 | | |
687 | | /* TODO: update bit estimates if dirty */ |
688 | 0 | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; |
689 | |
|
690 | 0 | uint32_t scanPos = 0; |
691 | 0 | uint32_t c1 = 1; |
692 | | |
693 | | // process trail all zero Coeff Group |
694 | | |
695 | | /* coefficients after lastNZ have no distortion signal cost */ |
696 | 0 | const int zeroCG = cgNum - 1 - cgLastScanPos; |
697 | 0 | memset(&costCoeff[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); |
698 | 0 | memset(&costSig[(cgLastScanPos + 1) << MLS_CG_SIZE], 0, zeroCG * MLS_CG_BLK_SIZE * sizeof(int64_t)); |
699 | | |
700 | | /* sum zero coeff (uncodec) cost */ |
701 | | |
702 | | // TODO: does we need these cost? |
703 | 0 | if (usePsyMask) |
704 | 0 | { |
705 | 0 | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) |
706 | 0 | { |
707 | 0 | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); |
708 | 0 | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
709 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; |
710 | 0 | #if X265_ARCH_X86 |
711 | 0 | bool enable512 = detect512(); |
712 | 0 | if (enable512) |
713 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
714 | 0 | else |
715 | 0 | { |
716 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos); |
717 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
718 | 0 | } |
719 | | #else |
720 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
721 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
722 | | #endif |
723 | 0 | } |
724 | 0 | } |
725 | 0 | else |
726 | 0 | { |
727 | | // non-psy path |
728 | 0 | for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++) |
729 | 0 | { |
730 | 0 | X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n"); |
731 | 0 | uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
732 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; |
733 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
734 | 0 | } |
735 | 0 | } |
736 | 0 | static const uint8_t table_cnt[5][SCAN_SET_SIZE] = |
737 | 0 | { |
738 | | // patternSigCtx = 0 |
739 | 0 | { |
740 | 0 | 2, 1, 1, 0, |
741 | 0 | 1, 1, 0, 0, |
742 | 0 | 1, 0, 0, 0, |
743 | 0 | 0, 0, 0, 0, |
744 | 0 | }, |
745 | | // patternSigCtx = 1 |
746 | 0 | { |
747 | 0 | 2, 2, 2, 2, |
748 | 0 | 1, 1, 1, 1, |
749 | 0 | 0, 0, 0, 0, |
750 | 0 | 0, 0, 0, 0, |
751 | 0 | }, |
752 | | // patternSigCtx = 2 |
753 | 0 | { |
754 | 0 | 2, 1, 0, 0, |
755 | 0 | 2, 1, 0, 0, |
756 | 0 | 2, 1, 0, 0, |
757 | 0 | 2, 1, 0, 0, |
758 | 0 | }, |
759 | | // patternSigCtx = 3 |
760 | 0 | { |
761 | 0 | 2, 2, 2, 2, |
762 | 0 | 2, 2, 2, 2, |
763 | 0 | 2, 2, 2, 2, |
764 | 0 | 2, 2, 2, 2, |
765 | 0 | }, |
766 | | // 4x4 |
767 | 0 | { |
768 | 0 | 0, 1, 4, 5, |
769 | 0 | 2, 3, 4, 5, |
770 | 0 | 6, 6, 8, 8, |
771 | 0 | 7, 7, 8, 8 |
772 | 0 | } |
773 | 0 | }; |
774 | | |
775 | | /* iterate over coding groups in reverse scan order */ |
776 | 0 | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0; cgScanPos--) |
777 | 0 | { |
778 | 0 | uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0; |
779 | 0 | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; |
780 | 0 | const uint32_t cgPosY = cgBlkPos >> log2TrSizeCG; |
781 | 0 | const uint32_t cgPosX = cgBlkPos & ((1 << log2TrSizeCG) - 1); |
782 | 0 | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); |
783 | 0 | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
784 | 0 | const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0); |
785 | |
|
786 | 0 | if (c1 == 0) |
787 | 0 | ctxSet++; |
788 | 0 | c1 = 1; |
789 | |
|
790 | 0 | if (cgScanPos && (coeffNum[cgScanPos] == 0)) |
791 | 0 | { |
792 | | // TODO: do we need zero-coeff cost?
793 | 0 | const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE); |
794 | 0 | uint32_t blkPos = codeParams.scan[scanPosBase]; |
795 | 0 | if (usePsyMask) |
796 | 0 | { |
797 | 0 | #if X265_ARCH_X86 |
798 | 0 | bool enable512 = detect512(); |
799 | 0 | if (enable512) |
800 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
801 | 0 | else |
802 | 0 | { |
803 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
804 | 0 | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
805 | 0 | } |
806 | | #else |
807 | | primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
808 | | primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos); |
809 | | #endif |
810 | 0 | blkPos = codeParams.scan[scanPosBase]; |
811 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
812 | 0 | { |
813 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
814 | 0 | { |
815 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; |
816 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; |
817 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); |
818 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
819 | |
|
820 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
821 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; |
822 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
823 | 0 | } |
824 | 0 | blkPos += trSize; |
825 | 0 | } |
826 | 0 | } |
827 | 0 | else |
828 | 0 | { |
829 | | // non-psy path |
830 | 0 | primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos); |
831 | 0 | blkPos = codeParams.scan[scanPosBase]; |
832 | 0 | for (int y = 0; y < MLS_CG_SIZE; y++) |
833 | 0 | { |
834 | 0 | for (int x = 0; x < MLS_CG_SIZE; x++) |
835 | 0 | { |
836 | 0 | const uint32_t scanPosOffset = y * MLS_CG_SIZE + x; |
837 | 0 | const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset; |
838 | 0 | X265_CHECK(trSize > 4, "trSize check failure\n"); |
839 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, codeParams.scan[scanPosBase + scanPosOffset], bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
840 | |
|
841 | 0 | costSig[scanPosBase + scanPosOffset] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
842 | 0 | costCoeff[scanPosBase + scanPosOffset] = costUncoded[blkPos + x]; |
843 | 0 | sigRateDelta[blkPos + x] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
844 | 0 | } |
845 | 0 | blkPos += trSize; |
846 | 0 | } |
847 | 0 | } |
848 | | |
849 | | /* there were no coded coefficients in this coefficient group */ |
850 | 0 | { |
851 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
852 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); |
853 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ |
854 | 0 | } |
855 | 0 | continue; |
856 | 0 | } |
857 | | |
858 | 0 | coeffGroupRDStats cgRdStats; |
859 | 0 | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); |
860 | |
|
861 | 0 | uint32_t subFlagMask = coeffFlag[cgScanPos]; |
862 | 0 | int c2 = 0; |
863 | 0 | uint32_t goRiceParam = 0; |
864 | 0 | uint32_t levelThreshold = 3; |
865 | 0 | uint32_t c1Idx = 0; |
866 | 0 | uint32_t c2Idx = 0; |
867 | | /* iterate over coefficients in each group in reverse scan order */ |
868 | 0 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) |
869 | 0 | { |
870 | 0 | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; |
871 | 0 | uint32_t blkPos = codeParams.scan[scanPos]; |
872 | 0 | uint32_t maxAbsLevel = dstCoeff[blkPos]; /* abs(quantized coeff) */ |
873 | 0 | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ |
874 | 0 | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ |
875 | | |
876 | | /* RDOQ measures distortion as the squared difference between the unquantized coded level |
877 | | * and the original DCT coefficient. The result is shifted scaleBits to account for the |
878 | | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ |
879 | | |
880 | | /* cost of not coding this coefficient (all distortion, no signal bits) */ |
881 | 0 | costUncoded[blkPos] = ((int64_t)signCoef * signCoef) << scaleBits; |
882 | 0 | X265_CHECK((!!scanPos ^ !!blkPos) == 0, "failed on (blkPos=0 && scanPos!=0)\n"); |
883 | 0 | if (usePsyMask & scanPos) |
884 | | /* when no residual coefficient is coded, predicted coef == recon coef */ |
885 | 0 | costUncoded[blkPos] -= PSYVALUE(predictedCoef); |
886 | |
|
887 | 0 | totalUncodedCost += costUncoded[blkPos]; |
888 | | |
889 | | // coefficient level estimation |
890 | 0 | const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1]; |
891 | | //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset; |
892 | 0 | static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL}; |
893 | 0 | uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx]; |
894 | 0 | const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset; |
895 | | // NOTE: above equal to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset' |
896 | 0 | X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n"); |
897 | | |
898 | | // last non-zero coeff not yet reached (this position is after it in scan order)
899 | 0 | if (scanPos > (uint32_t)lastScanPos) |
900 | 0 | { |
901 | | /* coefficients after lastNZ have no distortion signal cost */ |
902 | 0 | costCoeff[scanPos] = 0; |
903 | 0 | costSig[scanPos] = 0; |
904 | | |
905 | | /* No non-zero coefficient yet found, but this does not mean |
906 | | * there is no uncoded-cost for this coefficient. Pre- |
907 | | * quantization the coefficient may have been non-zero */ |
908 | 0 | totalRdCost += costUncoded[blkPos]; |
909 | 0 | } |
910 | 0 | else if (!(subFlagMask & 1)) |
911 | 0 | { |
912 | | // fast zero coeff path |
913 | | /* set default costs to uncoded costs */ |
914 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
915 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; |
916 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
917 | 0 | totalRdCost += costCoeff[scanPos]; |
918 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; |
919 | |
|
920 | 0 | subFlagMask >>= 1; |
921 | 0 | } |
922 | 0 | else |
923 | 0 | { |
924 | 0 | subFlagMask >>= 1; |
925 | |
|
926 | 0 | const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; |
927 | 0 | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3; // {1, 2, 1, 3} |
928 | |
|
929 | 0 | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); |
930 | 0 | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); |
931 | 0 | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); |
932 | 0 | X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n"); |
933 | | |
934 | | // coefficient level estimation |
935 | 0 | const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2]; |
936 | 0 | const uint32_t c1c2Rate = ((c1c2idx & 1) ? greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0); |
937 | |
|
938 | 0 | uint32_t level = 0; |
939 | 0 | uint32_t sigCoefBits = 0; |
940 | 0 | costCoeff[scanPos] = MAX_INT64; |
941 | |
|
942 | 0 | if ((int)scanPos == lastScanPos) |
943 | 0 | sigRateDelta[blkPos] = 0; |
944 | 0 | else |
945 | 0 | { |
946 | 0 | if (maxAbsLevel < 3) |
947 | 0 | { |
948 | | /* set default costs to uncoded costs */ |
949 | 0 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[0][ctxSig]); |
950 | 0 | costCoeff[scanPos] = costUncoded[blkPos] + costSig[scanPos]; |
951 | 0 | } |
952 | 0 | sigRateDelta[blkPos] = estBitsSbac.significantBits[1][ctxSig] - estBitsSbac.significantBits[0][ctxSig]; |
953 | 0 | sigCoefBits = estBitsSbac.significantBits[1][ctxSig]; |
954 | 0 | } |
955 | |
|
956 | 0 | const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound); |
957 | | // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1) | (0 < X < 2 ==> X=1) |
958 | 0 | if (maxAbsLevel == 1) |
959 | 0 | { |
960 | 0 | uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE; |
961 | 0 | X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n"); |
962 | |
|
963 | 0 | int unquantAbsLevel = unQuantLevel >> unquantShift; |
964 | 0 | X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n"); |
965 | 0 | int d = abs(signCoef) - unquantAbsLevel; |
966 | 0 | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); |
967 | | |
968 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ |
969 | 0 | if (usePsyMask & scanPos) |
970 | 0 | { |
971 | 0 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); |
972 | 0 | curCost -= PSYVALUE(reconCoef); |
973 | 0 | } |
974 | |
|
975 | 0 | if (curCost < costCoeff[scanPos]) |
976 | 0 | { |
977 | 0 | level = 1; |
978 | 0 | costCoeff[scanPos] = curCost; |
979 | 0 | costSig[scanPos] = SIGCOST(sigCoefBits); |
980 | 0 | } |
981 | 0 | } |
982 | 0 | else if (maxAbsLevel) |
983 | 0 | { |
984 | 0 | uint32_t levelBits0 = getICRateCost(maxAbsLevel, maxAbsLevel - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; |
985 | 0 | uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE; |
986 | |
|
987 | 0 | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); |
988 | |
|
989 | 0 | const int unquantAbsLevel0 = unQuantLevel >> unquantShift; |
990 | 0 | X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n"); |
991 | 0 | int d0 = abs(signCoef) - unquantAbsLevel0; |
992 | 0 | int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0); |
993 | |
|
994 | 0 | const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift; |
995 | 0 | X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n"); |
996 | 0 | int d1 = abs(signCoef) - unquantAbsLevel1; |
997 | 0 | int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1); |
998 | | |
999 | | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ |
1000 | 0 | if (usePsyMask & scanPos) |
1001 | 0 | { |
1002 | 0 | int reconCoef; |
1003 | 0 | reconCoef = abs(unquantAbsLevel0 + SIGN(predictedCoef, signCoef)); |
1004 | 0 | curCost0 -= PSYVALUE(reconCoef); |
1005 | |
|
1006 | 0 | reconCoef = abs(unquantAbsLevel1 + SIGN(predictedCoef, signCoef)); |
1007 | 0 | curCost1 -= PSYVALUE(reconCoef); |
1008 | 0 | } |
1009 | 0 | if (curCost0 < costCoeff[scanPos]) |
1010 | 0 | { |
1011 | 0 | level = maxAbsLevel; |
1012 | 0 | costCoeff[scanPos] = curCost0; |
1013 | 0 | costSig[scanPos] = SIGCOST(sigCoefBits); |
1014 | 0 | } |
1015 | 0 | if (curCost1 < costCoeff[scanPos]) |
1016 | 0 | { |
1017 | 0 | level = maxAbsLevel - 1; |
1018 | 0 | costCoeff[scanPos] = curCost1; |
1019 | 0 | costSig[scanPos] = SIGCOST(sigCoefBits); |
1020 | 0 | } |
1021 | 0 | } |
1022 | |
|
1023 | 0 | dstCoeff[blkPos] = (int16_t)level; |
1024 | 0 | totalRdCost += costCoeff[scanPos]; |
1025 | | |
1026 | | /* record costs for sign-hiding performed at the end */ |
1027 | 0 | if ((cu.m_slice->m_pps->bSignHideEnabled ? ~0 : 0) & level) |
1028 | 0 | { |
1029 | 0 | const int32_t diff0 = level - 1 - baseLevel; |
1030 | 0 | const int32_t diff2 = level + 1 - baseLevel; |
1031 | 0 | const int32_t maxVlc = g_goRiceRange[goRiceParam]; |
1032 | 0 | int rate0, rate1, rate2; |
1033 | |
|
1034 | 0 | if (diff0 < -2) // prob (92.9, 86.5, 74.5)% |
1035 | 0 | { |
1036 | | // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} |
1037 | | // additionally L > 0, which gives (L > 0 && L < 2) ==> L = 1
1038 | 0 | X265_CHECK(level == 1, "absLevel check failure\n"); |
1039 | |
|
1040 | 0 | const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; |
1041 | 0 | const int rateNotEqual2 = greaterOneBits[0]; |
1042 | |
|
1043 | 0 | rate0 = 0; |
1044 | 0 | rate2 = rateEqual2; |
1045 | 0 | rate1 = rateNotEqual2; |
1046 | |
|
1047 | 0 | X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1048 | 0 | X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1049 | 0 | X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); |
1050 | 0 | } |
1051 | 0 | else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% |
1052 | 0 | { |
1053 | | // NOTE: no c1c2 rate correction is applied since all of the rates include this factor
1054 | 0 | rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); |
1055 | 0 | rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); |
1056 | 0 | rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); |
1057 | 0 | } |
1058 | 0 | else |
1059 | 0 | { |
1060 | 0 | rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1061 | 0 | rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1062 | 0 | rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate); |
1063 | 0 | } |
1064 | 0 | rateIncUp[blkPos] = rate2 - rate1; |
1065 | 0 | rateIncDown[blkPos] = rate0 - rate1; |
1066 | 0 | } |
1067 | 0 | else |
1068 | 0 | { |
1069 | 0 | rateIncUp[blkPos] = greaterOneBits[0]; |
1070 | 0 | rateIncDown[blkPos] = 0; |
1071 | 0 | } |
1072 | | |
1073 | | /* Update CABAC estimation state */ |
1074 | 0 | if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold)) |
1075 | 0 | { |
1076 | 0 | goRiceParam++; |
1077 | 0 | levelThreshold <<= 1; |
1078 | 0 | } |
1079 | |
|
1080 | 0 | const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31; |
1081 | 0 | c1Idx += isNonZero; |
1082 | | |
1083 | | /* update bin model */ |
1084 | 0 | if (level > 1) |
1085 | 0 | { |
1086 | 0 | c1 = 0; |
1087 | 0 | c2 += (uint32_t)(c2 - 2) >> 31; |
1088 | 0 | c2Idx++; |
1089 | 0 | } |
1090 | 0 | else if (((c1 == 1) | (c1 == 2)) & isNonZero) |
1091 | 0 | c1++; |
1092 | |
|
1093 | 0 | if (dstCoeff[blkPos]) |
1094 | 0 | { |
1095 | 0 | sigCoeffGroupFlag64 |= cgBlkPosMask; |
1096 | 0 | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; |
1097 | 0 | cgRdStats.uncodedDist += costUncoded[blkPos]; |
1098 | 0 | cgRdStats.nnzBeforePos0 += scanPosinCG; |
1099 | 0 | } |
1100 | 0 | } |
1101 | |
|
1102 | 0 | cgRdStats.sigCost += costSig[scanPos]; |
1103 | 0 | } /* end for (scanPosinCG) */ |
1104 | |
|
1105 | 0 | X265_CHECK((cgScanPos << MLS_CG_SIZE) == (int)scanPos, "scanPos mistake\n"); |
1106 | 0 | cgRdStats.sigCost0 = costSig[scanPos]; |
1107 | |
|
1108 | 0 | costCoeffGroupSig[cgScanPos] = 0; |
1109 | | |
1110 | | /* nothing to do at this case */ |
1111 | 0 | X265_CHECK(cgLastScanPos >= 0, "cgLastScanPos check failure\n"); |
1112 | |
|
1113 | 0 | if (!cgScanPos || cgScanPos == cgLastScanPos) |
1114 | 0 | { |
1115 | | /* coeff group 0 is implied to be present, no signal cost */ |
1116 | | /* coeff group with last NZ is implied to be present, handled below */ |
1117 | 0 | } |
1118 | 0 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) |
1119 | 0 | { |
1120 | 0 | if (!cgRdStats.nnzBeforePos0) |
1121 | 0 | { |
1122 | | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ |
1123 | 0 | totalRdCost -= cgRdStats.sigCost0; |
1124 | 0 | cgRdStats.sigCost -= cgRdStats.sigCost0; |
1125 | 0 | } |
1126 | | |
1127 | | /* there are coded coefficients in this group, but now we include the signaling cost |
1128 | | * of the significant coefficient group flag and evaluate whether the RD cost of the |
1129 | | * coded group is more than the RD cost of the uncoded group */ |
1130 | |
|
1131 | 0 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
1132 | |
|
1133 | 0 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); |
1134 | 0 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ |
1135 | 0 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ |
1136 | 0 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ |
1137 | |
|
1138 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); |
1139 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ |
1140 | |
|
1141 | 0 | if (costZeroCG < totalRdCost && m_rdoqLevel > 1) |
1142 | 0 | { |
1143 | 0 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; |
1144 | 0 | totalRdCost = costZeroCG; |
1145 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); |
1146 | | |
1147 | | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ |
1148 | 0 | const uint32_t blkPos = codeParams.scan[cgScanPos * cgSize]; |
1149 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1150 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1151 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1152 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1153 | 0 | } |
1154 | 0 | } |
1155 | 0 | else |
1156 | 0 | { |
1157 | | /* there were no coded coefficients in this coefficient group */ |
1158 | 0 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride); |
1159 | 0 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); |
1160 | 0 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ |
1161 | 0 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ |
1162 | 0 | } |
1163 | 0 | } /* end for (cgScanPos) */ |
1164 | |
|
1165 | 0 | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); |
1166 | | |
1167 | | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ |
1168 | 0 | int64_t bestCost; |
1169 | 0 | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) |
1170 | 0 | { |
1171 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); |
1172 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); |
1173 | 0 | } |
1174 | 0 | else |
1175 | 0 | { |
1176 | 0 | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; |
1177 | 0 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); |
1178 | 0 | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); |
1179 | 0 | } |
1180 | | |
1181 | | /* This loop starts with the last non-zero found in the first loop and then refines this last |
1182 | | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs |
1183 | | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out |
1184 | | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty |
1185 | | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ |
1186 | 0 | int bestLastIdx = 0; |
1187 | 0 | bool foundLast = false; |
1188 | 0 | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) |
1189 | 0 | { |
1190 | 0 | if (!cgScanPos || cgScanPos == cgLastScanPos) |
1191 | 0 | { |
1192 | | /* the presence of these coefficient groups is inferred; they have no bit in
1193 | | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ |
1194 | 0 | } |
1195 | 0 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) |
1196 | 0 | { |
1197 | | /* remove cost of significant coeff group flag, the group's presence would be inferred |
1198 | | * from lastNZ if it were present in this group */ |
1199 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; |
1200 | 0 | } |
1201 | 0 | else |
1202 | 0 | { |
1203 | | /* remove cost of signaling this empty group as not present */ |
1204 | 0 | totalRdCost -= costCoeffGroupSig[cgScanPos]; |
1205 | 0 | continue; |
1206 | 0 | } |
1207 | | |
1208 | 0 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) |
1209 | 0 | { |
1210 | 0 | scanPos = cgScanPos * cgSize + scanPosinCG; |
1211 | 0 | if ((int)scanPos > lastScanPos) |
1212 | 0 | continue; |
1213 | | |
1214 | | /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then |
1215 | | * continue as if it were uncoded. If the coefficient was already uncoded, remove the |
1216 | | * cost of signaling it as not-significant */ |
1217 | 0 | uint32_t blkPos = codeParams.scan[scanPos]; |
1218 | 0 | if (dstCoeff[blkPos]) |
1219 | 0 | { |
1220 | | // Calculates the cost of signaling the last significant coefficient in the block |
1221 | 0 | uint32_t pos[2] = { (blkPos & (trSize - 1)), (blkPos >> log2TrSize) }; |
1222 | 0 | if (codeParams.scanType == SCAN_VER) |
1223 | 0 | std::swap(pos[0], pos[1]); |
1224 | 0 | uint32_t bitsLastNZ = 0; |
1225 | |
|
1226 | 0 | for (int i = 0; i < 2; i++) |
1227 | 0 | { |
1228 | 0 | int temp = g_lastCoeffTable[pos[i]]; |
1229 | 0 | int prefixOnes = temp & 15; |
1230 | 0 | int suffixLen = temp >> 4; |
1231 | |
|
1232 | 0 | bitsLastNZ += m_entropyCoder->m_estBitsSbac.lastBits[i][prefixOnes]; |
1233 | 0 | bitsLastNZ += IEP_RATE * suffixLen; |
1234 | 0 | } |
1235 | |
|
1236 | 0 | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); |
1237 | |
|
1238 | 0 | if (costAsLast < bestCost) |
1239 | 0 | { |
1240 | 0 | bestLastIdx = scanPos + 1; |
1241 | 0 | bestCost = costAsLast; |
1242 | 0 | } |
1243 | 0 | if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) |
1244 | 0 | { |
1245 | 0 | foundLast = true; |
1246 | 0 | break; |
1247 | 0 | } |
1248 | | |
1249 | 0 | totalRdCost -= costCoeff[scanPos]; |
1250 | 0 | totalRdCost += costUncoded[blkPos]; |
1251 | 0 | } |
1252 | 0 | else |
1253 | 0 | totalRdCost -= costSig[scanPos]; |
1254 | 0 | } |
1255 | 0 | } |
1256 | | |
1257 | | /* recount non-zero coefficients and re-apply sign of DCT coef */ |
1258 | 0 | numSig = 0; |
1259 | 0 | for (int pos = 0; pos < bestLastIdx; pos++) |
1260 | 0 | { |
1261 | 0 | int blkPos = codeParams.scan[pos]; |
1262 | 0 | int level = dstCoeff[blkPos]; |
1263 | 0 | numSig += (level != 0); |
1264 | |
|
1265 | 0 | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; |
1266 | 0 | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); |
1267 | 0 | } |
1268 | | |
1269 | | // Average 49.62 pixels |
1270 | | /* clean uncoded coefficients */ |
1271 | 0 | X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n"); |
1272 | 0 | for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++) |
1273 | 0 | { |
1274 | 0 | dstCoeff[codeParams.scan[pos]] = 0; |
1275 | 0 | } |
1276 | 0 | for (int pos = (bestLastIdx & ~(SCAN_SET_SIZE - 1)) + SCAN_SET_SIZE; pos <= lastScanPos; pos += SCAN_SET_SIZE) |
1277 | 0 | { |
1278 | 0 | const uint32_t blkPos = codeParams.scan[pos]; |
1279 | 0 | memset(&dstCoeff[blkPos + 0 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1280 | 0 | memset(&dstCoeff[blkPos + 1 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1281 | 0 | memset(&dstCoeff[blkPos + 2 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1282 | 0 | memset(&dstCoeff[blkPos + 3 * trSize], 0, 4 * sizeof(*dstCoeff)); |
1283 | 0 | } |
1284 | | |
1285 | | /* rate-distortion based sign-hiding */ |
1286 | 0 | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) |
1287 | 0 | { |
1288 | 0 | const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE; |
1289 | 0 | int lastCG = 1; |
1290 | |
|
1291 | 0 | for (int subSet = realLastScanPos; subSet >= 0; subSet--) |
1292 | 0 | { |
1293 | 0 | int subPos = subSet << LOG2_SCAN_SET_SIZE; |
1294 | 0 | int n; |
1295 | |
|
1296 | 0 | if (!(sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[subSet]))) |
1297 | 0 | continue; |
1298 | | |
1299 | | /* measure distance between first and last non-zero coef in this |
1300 | | * coding group */ |
1301 | 0 | const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]); |
1302 | 0 | const int firstNZPosInCG = (uint8_t)posFirstLast; |
1303 | 0 | const int lastNZPosInCG = (int8_t)(posFirstLast >> 8); |
1304 | 0 | const uint32_t absSumSign = posFirstLast; |
1305 | |
|
1306 | 0 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) |
1307 | 0 | { |
1308 | 0 | const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]); |
1309 | |
|
1310 | | #if CHECKED_BUILD || _DEBUG |
1311 | | int32_t absSum_dummy = 0; |
1312 | | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) |
1313 | | absSum_dummy += dstCoeff[codeParams.scan[n + subPos]]; |
1314 | | X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n"); |
1315 | | #endif |
1316 | | |
1317 | | //if (signbit != absSumSign) |
1318 | 0 | if (((int32_t)(signbit ^ absSumSign)) < 0) |
1319 | 0 | { |
1320 | | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff |
1321 | | * is properly implied. Note dstCoeff[] are signed by this point but curChange and |
1322 | | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ |
1323 | |
|
1324 | 0 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; |
1325 | 0 | uint32_t minPos = 0; |
1326 | 0 | int8_t finalChange = 0; |
1327 | 0 | int curChange = 0; |
1328 | 0 | uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE; |
1329 | |
|
1330 | 0 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) |
1331 | 0 | { |
1332 | 0 | const uint32_t blkPos = codeParams.scan[n + subPos]; |
1333 | 0 | const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ |
1334 | 0 | const int absLevel = abs(dstCoeff[blkPos]); |
1335 | | // TODO: this is constant in non-scaling mode |
1336 | 0 | const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per); |
1337 | 0 | const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound); |
1338 | |
|
1339 | 0 | int d = abs(signCoef) - (unQuantLevel >> unquantShift); |
1340 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n"); |
1341 | |
|
1342 | 0 | const int64_t origDist = (((int64_t)d * d)); |
1343 | |
|
1344 | 0 | #define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8)) |
1345 | |
|
1346 | 0 | const uint32_t isOne = (absLevel == 1); |
1347 | 0 | if (dstCoeff[blkPos]) |
1348 | 0 | { |
1349 | 0 | d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift); |
1350 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); |
1351 | 0 | int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]); |
1352 | | |
1353 | | /* if decrementing would make the coeff 0, we can include the |
1354 | | * significant coeff flag cost savings */ |
1355 | 0 | d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift); |
1356 | 0 | X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n"); |
1357 | 0 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); |
1358 | 0 | int64_t costDown = DELTARDCOST(origDist, d, downBits); |
1359 | |
|
1360 | 0 | costDown -= lastCoeffAdjust; |
1361 | 0 | curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown; |
1362 | |
|
1363 | 0 | curChange = 2 * (costUp < costDown) - 1; |
1364 | 0 | curCost = (costUp < costDown) ? costUp : curCost; |
1365 | 0 | } |
1366 | | //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31))) |
1367 | 0 | else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0)) |
1368 | 0 | { |
1369 | | /* don't try to make a new coded coeff before the first coeff if its |
1370 | | * sign would be different than the first coeff, the inferred sign would |
1371 | | * still be wrong and we'd have to do this again. */ |
1372 | 0 | curCost = MAX_INT64; |
1373 | 0 | } |
1374 | 0 | else |
1375 | 0 | { |
1376 | | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ |
1377 | 0 | d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift); |
1378 | 0 | X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n"); |
1379 | 0 | curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); |
1380 | 0 | curChange = 1; |
1381 | 0 | } |
1382 | |
|
1383 | 0 | if (curCost < minCostInc) |
1384 | 0 | { |
1385 | 0 | minCostInc = curCost; |
1386 | 0 | finalChange = (int8_t)curChange; |
1387 | 0 | minPos = blkPos + (absLevel << 16); |
1388 | 0 | } |
1389 | 0 | lastCoeffAdjust = 0; |
1390 | 0 | } |
1391 | |
|
1392 | 0 | const int absInMinPos = (minPos >> 16); |
1393 | 0 | minPos = (uint16_t)minPos; |
1394 | | |
1395 | | // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) |
1396 | 0 | if (absInMinPos >= 32767) |
1397 | | /* don't allow sign hiding to violate the SPEC range */ |
1398 | 0 | finalChange = -1; |
1399 | | |
1400 | | // NOTE: Reference code |
1401 | | //if (dstCoeff[minPos] == 0) |
1402 | | // numSig++; |
1403 | | //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) |
1404 | | // numSig--; |
1405 | 0 | numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1)); |
1406 | | |
1407 | | |
1408 | | // NOTE: Reference code |
1409 | | //if (m_resiDctCoeff[minPos] >= 0) |
1410 | | // dstCoeff[minPos] += finalChange; |
1411 | | //else |
1412 | | // dstCoeff[minPos] -= finalChange; |
1413 | 0 | const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16); |
1414 | 0 | dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign); |
1415 | 0 | } |
1416 | 0 | } |
1417 | |
|
1418 | 0 | lastCG = 0; |
1419 | 0 | } |
1420 | 0 | } |
1421 | |
|
1422 | 0 | return numSig; |
1423 | 0 | } Unexecuted instantiation: unsigned int x265::Quant::rdoQuant<2u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Unexecuted instantiation: unsigned int x265::Quant::rdoQuant<3u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Unexecuted instantiation: unsigned int x265::Quant::rdoQuant<4u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) Unexecuted instantiation: unsigned int x265::Quant::rdoQuant<5u>(x265::CUData const&, short*, x265::TextType, unsigned int, bool) |
1424 | | |
1425 | | /* Context derivation process of coeff_abs_significant_flag */ |
1426 | | uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, |
1427 | | uint32_t firstSignificanceMapContext) |
1428 | 0 | { |
1429 | 0 | static const uint8_t ctxIndMap[16] = |
1430 | 0 | { |
1431 | 0 | 0, 1, 4, 5, |
1432 | 0 | 2, 3, 4, 5, |
1433 | 0 | 6, 6, 8, 8, |
1434 | 0 | 7, 7, 8, 8 |
1435 | 0 | }; |
1436 | |
|
1437 | 0 | if (!blkPos) // special case for the DC context variable |
1438 | 0 | return 0; |
1439 | | |
1440 | 0 | if (log2TrSize == 2) // 4x4 |
1441 | 0 | return ctxIndMap[blkPos]; |
1442 | | |
1443 | 0 | const uint32_t posY = blkPos >> log2TrSize; |
1444 | 0 | const uint32_t posX = blkPos & (trSize - 1); |
1445 | 0 | X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n"); |
1446 | |
|
1447 | 0 | int posXinSubset = blkPos & 3; |
1448 | 0 | X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n"); |
1449 | 0 | int posYinSubset = posY & 3; |
1450 | | |
1451 | | // NOTE: [patternSigCtx][posXinSubset][posYinSubset] |
1452 | 0 | static const uint8_t table_cnt[4][4][4] = |
1453 | 0 | { |
1454 | | // patternSigCtx = 0 |
1455 | 0 | { |
1456 | 0 | { 2, 1, 1, 0 }, |
1457 | 0 | { 1, 1, 0, 0 }, |
1458 | 0 | { 1, 0, 0, 0 }, |
1459 | 0 | { 0, 0, 0, 0 }, |
1460 | 0 | }, |
1461 | | // patternSigCtx = 1 |
1462 | 0 | { |
1463 | 0 | { 2, 1, 0, 0 }, |
1464 | 0 | { 2, 1, 0, 0 }, |
1465 | 0 | { 2, 1, 0, 0 }, |
1466 | 0 | { 2, 1, 0, 0 }, |
1467 | 0 | }, |
1468 | | // patternSigCtx = 2 |
1469 | 0 | { |
1470 | 0 | { 2, 2, 2, 2 }, |
1471 | 0 | { 1, 1, 1, 1 }, |
1472 | 0 | { 0, 0, 0, 0 }, |
1473 | 0 | { 0, 0, 0, 0 }, |
1474 | 0 | }, |
1475 | | // patternSigCtx = 3 |
1476 | 0 | { |
1477 | 0 | { 2, 2, 2, 2 }, |
1478 | 0 | { 2, 2, 2, 2 }, |
1479 | 0 | { 2, 2, 2, 2 }, |
1480 | 0 | { 2, 2, 2, 2 }, |
1481 | 0 | } |
1482 | 0 | }; |
1483 | |
|
1484 | 0 | int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset]; |
1485 | 0 | int offset = firstSignificanceMapContext; |
1486 | |
|
1487 | 0 | offset += cnt; |
1488 | |
|
1489 | 0 | return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset; |
1490 | 0 | } |
1491 | | |