/src/x265/source/encoder/search.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * Min Chen <chenm003@163.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #include "common.h" |
26 | | #include "primitives.h" |
27 | | #include "picyuv.h" |
28 | | #include "cudata.h" |
29 | | |
30 | | #include "search.h" |
31 | | #include "entropy.h" |
32 | | #include "rdcost.h" |
33 | | |
34 | | #include "analysis.h" // TLD |
35 | | #include "framedata.h" |
36 | | |
37 | | using namespace X265_NS; |
38 | | |
39 | | #if _MSC_VER |
40 | | #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning) |
41 | | #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data) |
42 | | #pragma warning(disable: 4127) // conditional expression is constant |
43 | | #endif |
44 | | |
/* Bits needed to signal an MVP candidate index (presumably 1 bit for the
 * two-candidate AMVP list — confirm against the CABAC syntax tables). */
#define MVP_IDX_BITS 1

/* Shared all-zero int16 row, 32-byte aligned so SIMD primitives can use it
 * as a zero residual/reference operand. */
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
48 | | |
49 | | Search::Search() |
50 | 0 | { |
51 | 0 | memset(m_rqt, 0, sizeof(m_rqt)); |
52 | |
|
53 | 0 | for (int i = 0; i < 3; i++) |
54 | 0 | { |
55 | 0 | m_qtTempTransformSkipFlag[i] = NULL; |
56 | 0 | m_qtTempCbf[i] = NULL; |
57 | 0 | } |
58 | |
|
59 | 0 | m_numLayers = 0; |
60 | 0 | m_intraPred = NULL; |
61 | 0 | m_intraPredAngs = NULL; |
62 | 0 | m_fencScaled = NULL; |
63 | 0 | m_fencTransposed = NULL; |
64 | 0 | m_tsCoeff = NULL; |
65 | 0 | m_tsResidual = NULL; |
66 | 0 | m_tsRecon = NULL; |
67 | 0 | m_param = NULL; |
68 | 0 | m_slice = NULL; |
69 | 0 | m_frame = NULL; |
70 | 0 | m_maxTUDepth = -1; |
71 | 0 | } |
72 | | |
/* One-time allocation and configuration of the Search context from encoder
 * parameters. Returns true on success; any CHECKED_MALLOC failure jumps to
 * 'fail' and returns false (already-acquired buffers are released later by
 * the destructor). Must be called before any per-CTU search entry point. */
bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
    uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
    m_param = &param;
    m_bFrameParallel = param.frameNumThreads > 1;
    m_numLayers = g_log2Size[param.maxCUSize] - 2;

    m_rdCost.setPsyRdScale(param.psyRd);
    m_rdCost.setSsimRd(param.bSsimRd);
    m_me.init(param.internalCsp);

    /* 'ok' accumulates every fallible sub-init; checked once at the end */
    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
        ok &= m_quant.allocNoiseReduction(param);

    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */

    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
     * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;

    /* luma samples per CTU, chroma samples per plane, 4x4 partitions per CTU */
    uint32_t sizeL = 1 << (maxLog2CUSize * 2);
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
    uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;

    /* map the user-facing --limit-tu levels onto internal strategy flags */
    m_limitTU = 0;
    if (m_param->limitTU)
    {
        if (m_param->limitTU == 1)
            m_limitTU = X265_TU_LIMIT_BFS;
        else if (m_param->limitTU == 2)
            m_limitTU = X265_TU_LIMIT_DFS;
        else if (m_param->limitTU == 3)
            m_limitTU = X265_TU_LIMIT_NEIGH;
        else if (m_param->limitTU == 4)
            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
    }

    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
     * which are reconstructed at each depth are valid. At the end, the transform depth table
     * is walked and the coeff and recon at the correct depths are collected */

    if (param.internalCsp != X265_CSP_I400)
    {
        /* one allocation covers Y + both chroma planes; [1]/[2] alias into it */
        for (uint32_t i = 0; i <= m_numLayers; i++)
        {
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
        }
    }
    else
    {
        /* monochrome: luma-only coefficient storage */
        for (uint32_t i = 0; i <= m_numLayers; i++)
        {
            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
            ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp);
            ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp);
        }
    }

    /* the rest of these buffers are indexed per-depth */
    for (uint32_t i = 0; i <= m_param->maxCUDepth; i++)
    {
        int cuSize = param.maxCUSize >> i;
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
    }

    if (param.internalCsp != X265_CSP_I400)
    {
        /* per-partition CBF / transform-skip scratch for Y, Cb, Cr */
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
    }
    else
    {
        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
    }

    /* single allocation: 3 scratch 32x32 blocks followed by 33 angular
     * prediction buffers, sliced by pointer arithmetic below */
    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
    m_fencScaled = m_intraPred + 32 * 32;
    m_fencTransposed = m_fencScaled + 32 * 32;
    m_intraPredAngs = m_fencTransposed + 32 * 32;

    CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
    CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE);

    return ok;

fail: /* CHECKED_MALLOC jumps here on allocation failure */
    return false;
}
179 | | |
180 | | Search::~Search() |
181 | 0 | { |
182 | 0 | for (uint32_t i = 0; i <= m_numLayers; i++) |
183 | 0 | { |
184 | 0 | X265_FREE(m_rqt[i].coeffRQT[0]); |
185 | 0 | m_rqt[i].reconQtYuv.destroy(); |
186 | 0 | m_rqt[i].resiQtYuv.destroy(); |
187 | 0 | } |
188 | |
|
189 | 0 | for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) |
190 | 0 | { |
191 | 0 | m_rqt[i].tmpResiYuv.destroy(); |
192 | 0 | m_rqt[i].tmpPredYuv.destroy(); |
193 | 0 | m_rqt[i].bidirPredYuv[0].destroy(); |
194 | 0 | m_rqt[i].bidirPredYuv[1].destroy(); |
195 | 0 | } |
196 | |
|
197 | 0 | X265_FREE(m_qtTempCbf[0]); |
198 | 0 | X265_FREE(m_qtTempTransformSkipFlag[0]); |
199 | 0 | X265_FREE(m_intraPred); |
200 | 0 | X265_FREE(m_tsCoeff); |
201 | 0 | X265_FREE(m_tsResidual); |
202 | 0 | X265_FREE(m_tsRecon); |
203 | 0 | } |
204 | | |
205 | | int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp) |
206 | 0 | { |
207 | 0 | X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n"); |
208 | |
|
209 | 0 | m_me.setQP(qp); |
210 | 0 | m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp); |
211 | |
|
212 | 0 | int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp); |
213 | 0 | m_quant.setQPforQuant(ctu, quantQP); |
214 | 0 | return quantQP; |
215 | 0 | } |
216 | | |
#if CHECKED_BUILD || _DEBUG
/* Debug aid: mark every saved entropy-coder context at and below 'fromDepth'
 * invalid so that a load() of stale state asserts instead of silently
 * producing a corrupt bitstream. */
void Search::invalidateContexts(int fromDepth)
{
    /* catch reads without previous writes */
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
    {
        m_rqt[d].cur.markInvalid();
        m_rqt[d].rqtTemp.markInvalid();
        m_rqt[d].rqtRoot.markInvalid();
        m_rqt[d].rqtTest.markInvalid();
    }
}
#else
/* Release builds: validation is compiled out; keep a no-op so callers need
 * no conditional compilation. */
void Search::invalidateContexts(int) {}
#endif
232 | | |
233 | | void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx) |
234 | 0 | { |
235 | 0 | uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx]; |
236 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
237 | |
|
238 | 0 | if (!(log2TrSize - m_hChromaShift < 2)) |
239 | 0 | { |
240 | 0 | uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2); |
241 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1)) |
242 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); |
243 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1)) |
244 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); |
245 | 0 | } |
246 | |
|
247 | 0 | if (subdiv) |
248 | 0 | { |
249 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
250 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
251 | 0 | codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx); |
252 | 0 | } |
253 | 0 | } |
254 | | |
/* Recursively emit the chroma transform coefficients ('ttype' selects Cb or
 * Cr) stored in the per-layer RQT buffers for one node of the transform
 * tree. Nodes whose CBF is clear are skipped entirely; subdivided nodes
 * descend into their four children. */
void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
        return;

    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        /* subdivided: recurse into the four quadrants */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);

        return;
    }

    uint32_t tuDepthC = tuDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (log2TrSizeC < 2)
    {
        /* chroma TU would be below 4x4: the four luma 4x4s share one 4x4
         * chroma TU; only the first of the four partitions codes it */
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--; /* NOTE(review): tuDepthC is not read after this in the visible code */
    }

    uint32_t qtLayer = log2TrSize - 2;

    if (m_csp != X265_CSP_I422)
    {
        /* 4:2:0 chroma holds a quarter of the luma samples, 4:4:4 the same count */
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
    }
    else
    {
        /* 4:2:2: each TU carries two vertically-stacked square chroma sub-TUs,
         * each with its own CBF at tuDepth + 1 */
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
    }
}
304 | | |
/* Recursively RD-search the luma residual quadtree of an intra CU.
 * Each node may be coded at the current TU size ("full") and/or split into
 * four children, as permitted by depthRange[] and bAllowSplit; the cheaper
 * alternative by RD cost (psy-RD or SSIM-RD variants when enabled) is kept.
 * The winning reconstruction is copied into the frame recon picture so
 * subsequent intra predictions can reference it, and this node's cost is
 * accumulated into outCost. Entropy-coder snapshots in m_rqt[fullDepth]
 * (rqtRoot/rqtTest) let the two alternatives be bit-counted from the same
 * starting context — their save/load order is critical. */
void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t sizeIdx = log2TrSize - 2;
    bool mightNotSplit = log2TrSize <= depthRange[1];
    bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    bool bEnableRDOQ = !!m_param->rdoqLevel;

    /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
    {
        /* snapshot entropy state so the split alternative can restart from it */
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // store original entropy coding status
        if (bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
        /* [stride % 64 == 0] selects the aligned fast path of the primitive */
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            /* aligned add_ps kernel is only legal when every buffer and stride is 64-byte aligned */
            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
        }
        else
            // no coded residual, recon = pred
            primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);

        /* CBF bit for this TU stored at this depth's bit position */
        bCBF = !!numSig << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);

        /* bit-count everything that would be signalled for this TU, including
         * CU-level syntax when this is the CU's first partition */
        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            /* NxN partitioning: luma dirs are coded once per quadrant */
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        /* rdPenalty: inflate the bit cost of 32x32 TUs in non-I slices */
        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else if(m_rdCost.m_ssimRd)
        {
            fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
            fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }
    else
        fullCost.rdcost = MAX_INT64; /* full coding forbidden; split always wins */

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
        }

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        /* children at half size may qualify for transform-skip evaluation */
        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        /* propagate the OR of child CBFs up to this depth */
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else if(m_rdCost.m_ssimRd)
                splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            /* split wins: its recursion already wrote recon/CBF state */
            outCost.rdcost += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits += splitCost.bits;
            outCost.energy += splitCost.energy;
            return;
        }
        else
        {
            // recover entropy state of full-size TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
509 | | |
/* RD-evaluate one small luma intra TU both with the normal transform and
 * with transform-skip, keeping the cheaper of the two. The tskip trial is
 * staged in the m_tsCoeff/m_tsRecon scratch buffers so the regular-path
 * results are not clobbered; if tskip wins, its coefficients and recon are
 * copied into the RQT buffers. Entropy snapshots (rqtRoot/rqtTemp) let both
 * trials be bit-counted from the same context. The winning recon is copied
 * into the frame recon picture and the cost accumulated into outCost. */
void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    bool bEnableRDOQ = !!m_param->rdoqLevel;

    X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;
    Yuv* predYuv = &mode.predYuv;
    const Yuv* fencYuv = mode.fencYuv;

    Cost fullCost;
    fullCost.rdcost = MAX_INT64; /* best-so-far; first trial always wins */
    int bTSkip = 0;
    uint32_t bCBF = 0;

    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
    pixel* pred = predYuv->getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t stride = fencYuv->m_size;
    uint32_t sizeIdx = log2TrSize - 2;

    // init availability pattern
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    IntraNeighbors intraNeighbors;
    initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
    initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

    // get prediction signal
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // store original entropy coding status
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    /* trial 0 = regular transform, trial 1 = transform skip (unless CBF=0
     * disables it mid-loop by clearing checkTransformSkip) */
    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t tmpCost;
        uint32_t tmpEnergy = 0;

        /* tskip trial writes into scratch buffers; regular trial into the RQT */
        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
        pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
        bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
            /* aligned add_ps kernel requires 64-byte alignment on all operands */
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
            bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
            // no residual coded, recon = pred
            primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);

        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        /* second trial restarts bit counting from the saved root context */
        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            /* NxN partitioning: luma dirs are coded once per quadrant */
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();

        /* remember post-regular-trial context in case regular coding wins */
        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else if(m_rdCost.m_ssimRd)
        {
            tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx);
            tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

        if (tmpCost < fullCost.rdcost)
        {
            bTSkip = useTSkip;
            bCBF = !!numSig;
            fullCost.rdcost = tmpCost;
            fullCost.distortion = tmpDist;
            fullCost.bits = tmpBits;
            fullCost.energy = tmpEnergy;
        }
    }

    if (bTSkip)
    {
        /* tskip won: promote its scratch coeff/recon into the RQT buffers */
        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
    }
    else if (checkTransformSkip)
    {
        /* both trials ran and regular coding won: restore its CU flags and
         * the entropy context captured right after the regular trial */
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // set reconstruction for next intra prediction blocks
    PicYuv* reconPic = m_frame->m_reconPic;
    pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
    intptr_t picStride = reconPic->m_stride;
    primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
685 | | |
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    // only code the TU at this depth if it fits within the allowed TU size range
    bool bCheckFull = log2TrSize <= depthRange[1];

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
     * since we are not measuring RD cost */
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
        bCheckFull = false;

    if (bCheckFull)
    {
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
        initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY;

        uint32_t sizeIdx = log2TrSize - 2;
        // the [stride % 64 == 0] index selects the 64-byte-aligned primitive variant
        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

        PicYuv* reconPic = m_frame->m_reconPic;
        pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
        intptr_t picStride = reconPic->m_stride;

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            // coded residual present: inverse-transform and add to prediction,
            // writing the reconstruction directly into the frame recon picture
            m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            // all source/dest pointers and strides must be 64-byte aligned to
            // use the aligned add_ps primitive
            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        }
        else
        {
            // all coefficients quantized to zero: recon = pred, CBF = 0
            primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride);
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }
    else
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        // propagate the combined child CBF up to this TU depth
        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
    }
}
762 | | |
/* Copy the best luma RQT results (coefficients and reconstruction) out of the
 * per-depth RQT scratch buffers into the CU's coefficient array and the given
 * recon Yuv. Recurses until the TU depth recorded in the CU is reached. */
void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;

    if (tuDepth == cu.m_tuDepth[absPartIdx])
    {
        uint32_t qtLayer = log2TrSize - 2;

        // copy transform coefficients
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
        coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY;
        // (1 << (log2TrSize * 2)) coefficients for an NxN TU
        memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
    }
    else
    {
        // TU was split: recurse into the four quadrants
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
    }
}
787 | | |
/* Shift each 4:2:2 sub-TU CBF up one bit position and OR in the combined
 * flag of both halves, so each half also reflects whether either half has
 * any coded coefficients. Operates in place on the two-entry array. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t anySet = subTUCBF[0] | subTUCBF[1];
    for (int i = 0; i < 2; i++)
        subTUCBF[i] = (uint8_t)((subTUCBF[i] << 1) | anySet);
}
794 | | |
795 | | /* 4:2:2 post-TU split processing */ |
796 | | void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx) |
797 | 0 | { |
798 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
799 | |
|
800 | 0 | if (log2TrSize == 2) |
801 | 0 | { |
802 | 0 | X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
803 | 0 | ++log2TrSize; |
804 | 0 | } |
805 | |
|
806 | 0 | uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1); |
807 | | |
808 | | // move the CBFs down a level and set the parent CBF |
809 | 0 | uint8_t subTUCBF[2]; |
810 | 0 | subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth); |
811 | 0 | subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth); |
812 | 0 | offsetCBFs(subTUCBF); |
813 | |
|
814 | 0 | cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts); |
815 | 0 | cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts); |
816 | 0 | } |
817 | | |
/* returns distortion */
/* RD-level chroma intra coding for one TU: predict both chroma planes,
 * transform/quantize, reconstruct into the RQT scratch buffers and the frame
 * recon picture, and accumulate distortion/energy into outCost. Recurses when
 * the CU's recorded TU depth is deeper than this level. */
void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    CUData& cu = mode.cu;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    bool bEnableRDOQ = !!m_param->rdoqLevel;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        // TU is split further: recurse into the four quadrants and propagate
        // the children's chroma CBFs up to this depth
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);

        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        // chroma TU would be below 4x4: a single 4x4 chroma TU covers four
        // luma 4x4s, so only the first sub-part does the work
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    if (bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

    // transform-skip is only evaluated for small TUs, and with bEnableTSkipFast
    // only where the co-located luma block already chose tskip
    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
    if (checkTransformSkip)
    {
        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
        return;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t stride = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC = log2TrSizeC - 2;

    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
    // 4:2:2 codes two stacked chroma TUs per luma TU
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
            PicYuv* reconPic = m_frame->m_reconPic;
            pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;

            // resolve DM_CHROMA to the co-located luma direction; remap the
            // angle for 4:2:2 geometry
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (numSig)
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                // select aligned add_ps variant only when all buffers/strides
                // are 64-byte aligned
                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }

            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));

            if (m_rdCost.m_psyRd)
                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
            else if(m_rdCost.m_ssimRd)
                outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);

            // mirror the RQT reconstruction into the frame recon picture
            primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        // 4:2:2: merge the two half-TU CBFs into the parent-level flags
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
945 | | |
/* returns distortion */
/* RD-evaluate each chroma TU both with and without transform skip, keep the
 * cheaper candidate's coefficients/recon/CBF, and accumulate distortion and
 * energy into outCost. Entropy coder state is saved on entry and restored on
 * exit so only self-consistent bit estimates are made. */
void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
    const uint32_t log2TrSizeC = 2;   // transform skip is only checked for 4x4 chroma TUs
    uint32_t qtLayer = log2TrSize - 2;

    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
     * so the entropy coder is not very accurate. The best we can do is return it in the same
     * condition as it arrived, and to do all bit estimates from the same state. */
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            const uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // resolve DM_CHROMA to the co-located luma direction; remap the
            // angle for 4:2:2 geometry
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            // best-so-far candidate across the useTSkip={0,1} trials
            uint64_t bCost = MAX_INT64;
            sse_t bDist = 0;
            uint32_t bCbf = 0;
            uint32_t bEnergy = 0;
            int bTSkip = 0;

            int checkTransformSkip = 1;
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
            {
                // tskip trial writes into the dedicated m_tsCoeff/m_tsRecon
                // buffers so the non-tskip result in the RQT is preserved
                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
                pixel* recon = (useTSkip ? m_tsRecon : reconQt);
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                    // NOTE(review): when useTSkip this evaluates (1 % 64) == 0,
                    // i.e. false, forcing the unaligned add_ps variant for the
                    // tskip recon buffer -- possibly intended to be "aligned";
                    // confirm against m_tsRecon's actual alignment
                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else if (useTSkip)
                {
                    // tskip produced no coefficients; abandon the tskip trial
                    checkTransformSkip = 0;
                    break;
                }
                else
                {
                    primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);

                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

                uint32_t tmpBits = 0, tmpEnergy = 0;
                if (numSig)
                {
                    // estimate coefficient bits from the stored coder state
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
                    m_entropyCoder.resetBits();
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
                }

                uint64_t tmpCost;
                // NOTE(review): the psy/ssim energy below is measured against
                // reconQt/reconQtStride even in the useTSkip trial, whose
                // reconstruction is in m_tsRecon -- verify this is intentional
                if (m_rdCost.m_psyRd)
                {
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else if(m_rdCost.m_ssimRd)
                {
                    tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC);
                    tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

                if (tmpCost < bCost)
                {
                    bCost = tmpCost;
                    bDist = tmpDist;
                    bTSkip = useTSkip;
                    bCbf = !!numSig;
                    bEnergy = tmpEnergy;
                }
            }

            if (bTSkip)
            {
                // tskip won: copy its coefficients and recon into the RQT buffers
                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
            }

            // re-apply the winning candidate's flags (the loop left the last
            // trial's values in place)
            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            PicYuv* reconPic = m_frame->m_reconPic;
            pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;
            primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);

            outCost.distortion += bDist;
            outCost.energy += bEnergy;
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        // 4:2:2: merge the two half-TU CBFs into the parent-level flags
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }

    // restore the entropy coder to the state it arrived in
    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
1100 | | |
/* Copy the best chroma RQT results (U/V coefficients and reconstruction) out
 * of the per-layer RQT scratch buffers into the CU's coefficient arrays and
 * the given recon Yuv. Recurses down the TU tree; a 4x4 chroma TU terminates
 * the recursion one level above the luma TU depth. */
void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
    uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (tuDepthL == tuDepth || log2TrSizeC == 2)
    {
        // copy transform coefficients
        // 4:2:2 TUs hold two stacked transforms, hence the extra factor of 2
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

        // when stopping at a shared 4x4 chroma TU, the data lives one RQT
        // layer higher than the luma depth would suggest
        uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
    }
    else
    {
        // TU was split: recurse into the four quadrants
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
    }
}
1131 | | |
/* Fast (non-RD) chroma intra residual coding: predict both chroma planes,
 * transform/quantize into the CU's coefficient arrays, and reconstruct
 * directly into the frame recon picture. No cost is measured; only CBFs are
 * updated. Recurses while the recorded TU depth is deeper. */
void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth)
{
    CUData& cu = mode.cu;
    uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        // TU is split further: recurse into the four quadrants and propagate
        // the children's chroma CBFs up to this depth
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);

        return;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        // chroma TU would be below 4x4: a single 4x4 chroma TU covers four
        // luma 4x4s, so only the first sub-part does the work
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
    uint32_t stride = mode.fencYuv->m_csize;
    const uint32_t sizeIdxC = log2TrSizeC - 2;

    uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
    // 4:2:2 codes two stacked chroma TUs per luma TU
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    TURecurse tuIterator(splitType, curPartNum, absPartIdx);
    do
    {
        uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

        IntraNeighbors intraNeighbors;
        initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors);

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            // coefficients go straight into the CU (no RQT scratch at low RD levels)
            coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC;
            PicYuv* reconPic = m_frame->m_reconPic;
            pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
            intptr_t picStride = reconPic->m_strideC;

            // resolve DM_CHROMA to the co-located luma direction; remap the
            // angle for 4:2:2 geometry
            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);

            // get prediction signal
            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);

            X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);

            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (numSig)
            {
                m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                // select aligned add_ps variant only when all buffers/strides
                // are 64-byte aligned
                bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
                bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                // no coded residual, recon = pred
                primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
        }
    }
    while (tuIterator.isNextSection());

    if (splitType == VERTICAL_SPLIT)
    {
        // 4:2:2: merge the two half-TU CBFs into the parent-level flags
        offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
        offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
    }
}
1235 | | |
/* Fully RD-evaluate an intra mode for this CU: run the luma and chroma intra
 * TU searches, then code the CU's syntax to measure the exact bit cost, and
 * finish with energy/cost bookkeeping. Results are stored in intraMode and in
 * the CU; the entropy coder state after coding is saved in intraMode.contexts. */
void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
{
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    intraMode.initCosts();
    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
    if (m_csp != X265_CSP_I400)
    {
        // chroma search only for formats that carry chroma planes
        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
    }
    else
        intraMode.distortion += intraMode.lumaDistortion;
    cu.m_distortion[0] = intraMode.distortion;
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    // non-intra slices code a skip flag and pred mode before the intra syntax;
    // those bits are excluded from mvBits below
    int skipFlagBits = 0;
    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    // snapshot the coder state so the caller can continue from this context
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
    const Yuv* fencYuv = intraMode.fencYuv;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
    else if(m_rdCost.m_ssimRd)
        intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);

    // residual energy: SSE between source and prediction (before residual coding)
    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);

    updateModeCost(intraMode);
    checkDQP(intraMode, cuGeom);
}
1288 | | |
1289 | | /* Note that this function does not save the best intra prediction, it must |
1290 | | * be generated later. It records the best mode in the cu */ |
1291 | | void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom) |
1292 | 0 | { |
1293 | 0 | ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); |
1294 | |
|
1295 | 0 | CUData& cu = intraMode.cu; |
1296 | 0 | uint32_t depth = cuGeom.depth; |
1297 | |
|
1298 | 0 | cu.setPartSizeSubParts(SIZE_2Nx2N); |
1299 | 0 | cu.setPredModeSubParts(MODE_INTRA); |
1300 | |
|
1301 | 0 | const uint32_t initTuDepth = 0; |
1302 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1303 | 0 | uint32_t tuSize = 1 << log2TrSize; |
1304 | 0 | const uint32_t absPartIdx = 0; |
1305 | | |
1306 | | // Reference sample smoothing |
1307 | 0 | IntraNeighbors intraNeighbors; |
1308 | 0 | initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); |
1309 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); |
1310 | |
|
1311 | 0 | const pixel* fenc = intraMode.fencYuv->m_buf[0]; |
1312 | 0 | uint32_t stride = intraMode.fencYuv->m_size; |
1313 | |
|
1314 | 0 | int sad, bsad; |
1315 | 0 | uint32_t bits, bbits, mode, bmode; |
1316 | 0 | uint64_t cost, bcost; |
1317 | | |
1318 | | // 33 Angle modes once |
1319 | 0 | int scaleTuSize = tuSize; |
1320 | 0 | int scaleStride = stride; |
1321 | 0 | int costShift = 0; |
1322 | 0 | int sizeIdx = log2TrSize - 2; |
1323 | |
|
1324 | 0 | if (tuSize > 32) |
1325 | 0 | { |
1326 | | // CU is 64x64, we scale to 32x32 and adjust required parameters |
1327 | 0 | primitives.scale2D_64to32(m_fencScaled, fenc, stride); |
1328 | 0 | fenc = m_fencScaled; |
1329 | |
|
1330 | 0 | pixel nScale[129]; |
1331 | 0 | intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0]; |
1332 | 0 | primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1); |
1333 | | |
1334 | | // we do not estimate filtering for downscaled samples |
1335 | 0 | memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels |
1336 | 0 | memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel)); |
1337 | |
|
1338 | 0 | scaleTuSize = 32; |
1339 | 0 | scaleStride = 32; |
1340 | 0 | costShift = 2; |
1341 | 0 | sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 |
1342 | 0 | } |
1343 | |
|
1344 | 0 | pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; |
1345 | 0 | int predsize = scaleTuSize * scaleTuSize; |
1346 | |
|
1347 | 0 | m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); |
1348 | | |
1349 | | /* there are three cost tiers for intra modes: |
1350 | | * pred[0] - mode probable, least cost |
1351 | | * pred[1], pred[2] - less probable, slightly more cost |
1352 | | * non-mpm modes - all cost the same (rbits) */ |
1353 | 0 | uint64_t mpms; |
1354 | 0 | uint32_t mpmModes[3]; |
1355 | 0 | uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); |
1356 | | |
1357 | | // DC |
1358 | 0 | primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); |
1359 | 0 | bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; |
1360 | 0 | bmode = mode = DC_IDX; |
1361 | 0 | bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1362 | 0 | bcost = m_rdCost.calcRdSADCost(bsad, bbits); |
1363 | | |
1364 | | // PLANAR |
1365 | 0 | pixel* planar = intraNeighbourBuf[0]; |
1366 | 0 | if (tuSize & (8 | 16 | 32)) |
1367 | 0 | planar = intraNeighbourBuf[1]; |
1368 | |
|
1369 | 0 | primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0); |
1370 | 0 | sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; |
1371 | 0 | mode = PLANAR_IDX; |
1372 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1373 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); |
1374 | 0 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); |
1375 | |
|
1376 | 0 | bool allangs = true; |
1377 | 0 | if (primitives.cu[sizeIdx].intra_pred_allangs) |
1378 | 0 | { |
1379 | 0 | primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); |
1380 | 0 | primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); |
1381 | 0 | } |
1382 | 0 | else |
1383 | 0 | allangs = false; |
1384 | |
|
1385 | 0 | #define TRY_ANGLE(angle) \ |
1386 | 0 | if (allangs) { \ |
1387 | 0 | if (angle < 18) \ |
1388 | 0 | sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ |
1389 | 0 | else \ |
1390 | 0 | sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ |
1391 | 0 | bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ |
1392 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); \ |
1393 | 0 | } else { \ |
1394 | 0 | int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \ |
1395 | 0 | primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \ |
1396 | 0 | sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \ |
1397 | 0 | bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ |
1398 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); \ |
1399 | 0 | } |
1400 | |
|
1401 | 0 | if (m_param->bEnableFastIntra) |
1402 | 0 | { |
1403 | 0 | int asad = 0; |
1404 | 0 | uint32_t lowmode, highmode, amode = 5, abits = 0; |
1405 | 0 | uint64_t acost = MAX_INT64; |
1406 | | |
1407 | | /* pick the best angle, sampling at distance of 5 */ |
1408 | 0 | for (mode = 5; mode < 35; mode += 5) |
1409 | 0 | { |
1410 | 0 | TRY_ANGLE(mode); |
1411 | 0 | COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); |
1412 | 0 | } |
1413 | | |
1414 | | /* refine best angle at distance 2, then distance 1 */ |
1415 | 0 | for (uint32_t dist = 2; dist >= 1; dist--) |
1416 | 0 | { |
1417 | 0 | lowmode = amode - dist; |
1418 | 0 | highmode = amode + dist; |
1419 | |
|
1420 | 0 | X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); |
1421 | 0 | TRY_ANGLE(lowmode); |
1422 | 0 | COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); |
1423 | |
|
1424 | 0 | X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); |
1425 | 0 | TRY_ANGLE(highmode); |
1426 | 0 | COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); |
1427 | 0 | } |
1428 | |
|
1429 | 0 | if (amode == 33) |
1430 | 0 | { |
1431 | 0 | TRY_ANGLE(34); |
1432 | 0 | COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); |
1433 | 0 | } |
1434 | |
|
1435 | 0 | COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); |
1436 | 0 | } |
1437 | 0 | else // calculate and search all intra prediction angles for lowest cost |
1438 | 0 | { |
1439 | 0 | for (mode = 2; mode < 35; mode++) |
1440 | 0 | { |
1441 | 0 | TRY_ANGLE(mode); |
1442 | 0 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); |
1443 | 0 | } |
1444 | 0 | } |
1445 | |
|
1446 | 0 | cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth); |
1447 | 0 | intraMode.initCosts(); |
1448 | 0 | intraMode.totalBits = bbits; |
1449 | 0 | intraMode.distortion = bsad; |
1450 | 0 | intraMode.sa8dCost = bcost; |
1451 | 0 | intraMode.sa8dBits = bbits; |
1452 | 0 | } |
1453 | | |
1454 | | void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) |
1455 | 0 | { |
1456 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1457 | |
|
1458 | 0 | CUData& cu = intraMode.cu; |
1459 | 0 | Yuv* reconYuv = &intraMode.reconYuv; |
1460 | |
|
1461 | 0 | X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); |
1462 | 0 | X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); |
1463 | |
|
1464 | 0 | uint32_t tuDepthRange[2]; |
1465 | 0 | cu.getIntraTUQtDepthRange(tuDepthRange, 0); |
1466 | |
|
1467 | 0 | m_entropyCoder.load(m_rqt[cuGeom.depth].cur); |
1468 | |
|
1469 | 0 | Cost icosts; |
1470 | 0 | codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); |
1471 | 0 | extractIntraResultQT(cu, *reconYuv, 0, 0); |
1472 | |
|
1473 | 0 | intraMode.lumaDistortion = icosts.distortion; |
1474 | 0 | if (m_csp != X265_CSP_I400) |
1475 | 0 | { |
1476 | 0 | intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom); |
1477 | 0 | intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion; |
1478 | 0 | } |
1479 | 0 | else |
1480 | 0 | intraMode.distortion = intraMode.lumaDistortion; |
1481 | |
|
1482 | 0 | m_entropyCoder.resetBits(); |
1483 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
1484 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
1485 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
1486 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
1487 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
1488 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
1489 | 0 | m_entropyCoder.codePredInfo(cu, 0); |
1490 | 0 | intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
1491 | |
|
1492 | 0 | bool bCodeDQP = m_slice->m_pps->bUseDQP; |
1493 | 0 | m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); |
1494 | |
|
1495 | 0 | intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); |
1496 | 0 | intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits; |
1497 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1498 | 0 | if (m_rdCost.m_psyRd) |
1499 | 0 | intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
1500 | 0 | else if(m_rdCost.m_ssimRd) |
1501 | 0 | intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cuGeom.log2CUSize, TEXT_LUMA, 0); |
1502 | |
|
1503 | 0 | intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size); |
1504 | 0 | m_entropyCoder.store(intraMode.contexts); |
1505 | 0 | updateModeCost(intraMode); |
1506 | 0 | checkDQP(intraMode, cuGeom); |
1507 | 0 | } |
1508 | | |
1509 | | sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]) |
1510 | 0 | { |
1511 | 0 | CUData& cu = intraMode.cu; |
1512 | 0 | Yuv* reconYuv = &intraMode.reconYuv; |
1513 | 0 | Yuv* predYuv = &intraMode.predYuv; |
1514 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1515 | |
|
1516 | 0 | uint32_t depth = cuGeom.depth; |
1517 | 0 | uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; |
1518 | 0 | uint32_t numPU = 1 << (2 * initTuDepth); |
1519 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1520 | 0 | uint32_t tuSize = 1 << log2TrSize; |
1521 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
1522 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
1523 | 0 | uint32_t absPartIdx = 0; |
1524 | 0 | sse_t totalDistortion = 0; |
1525 | |
|
1526 | 0 | int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N; |
1527 | | |
1528 | | // loop over partitions |
1529 | 0 | for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts) |
1530 | 0 | { |
1531 | 0 | uint32_t bmode = 0; |
1532 | |
|
1533 | 0 | if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX) |
1534 | 0 | bmode = intraMode.cu.m_lumaIntraDir[puIdx]; |
1535 | 0 | else |
1536 | 0 | { |
1537 | 0 | uint64_t candCostList[MAX_RD_INTRA_MODES]; |
1538 | 0 | uint32_t rdModeList[MAX_RD_INTRA_MODES]; |
1539 | 0 | uint64_t bcost; |
1540 | 0 | int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); |
1541 | |
|
1542 | 0 | { |
1543 | 0 | ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); |
1544 | | |
1545 | | // Reference sample smoothing |
1546 | 0 | IntraNeighbors intraNeighbors; |
1547 | 0 | initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); |
1548 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); |
1549 | | |
1550 | | // determine set of modes to be tested (using prediction signal only) |
1551 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
1552 | 0 | uint32_t stride = predYuv->m_size; |
1553 | |
|
1554 | 0 | int scaleTuSize = tuSize; |
1555 | 0 | int scaleStride = stride; |
1556 | 0 | int costShift = 0; |
1557 | |
|
1558 | 0 | m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); |
1559 | | |
1560 | | /* there are three cost tiers for intra modes: |
1561 | | * pred[0] - mode probable, least cost |
1562 | | * pred[1], pred[2] - less probable, slightly more cost |
1563 | | * non-mpm modes - all cost the same (rbits) */ |
1564 | 0 | uint64_t mpms; |
1565 | 0 | uint32_t mpmModes[3]; |
1566 | 0 | uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); |
1567 | |
|
1568 | 0 | pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; |
1569 | 0 | uint64_t modeCosts[35]; |
1570 | | |
1571 | | // DC |
1572 | 0 | primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); |
1573 | 0 | uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits; |
1574 | 0 | uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; |
1575 | 0 | modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); |
1576 | | |
1577 | | // PLANAR |
1578 | 0 | pixel* planar = intraNeighbourBuf[0]; |
1579 | 0 | if (tuSize >= 8 && tuSize <= 32) |
1580 | 0 | planar = intraNeighbourBuf[1]; |
1581 | |
|
1582 | 0 | primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0); |
1583 | 0 | bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits; |
1584 | 0 | sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; |
1585 | 0 | modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits); |
1586 | 0 | COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]); |
1587 | | |
1588 | | // angular predictions |
1589 | 0 | if (primitives.cu[sizeIdx].intra_pred_allangs) |
1590 | 0 | { |
1591 | 0 | primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); |
1592 | 0 | primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); |
1593 | 0 | for (int mode = 2; mode < 35; mode++) |
1594 | 0 | { |
1595 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1596 | 0 | if (mode < 18) |
1597 | 0 | sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; |
1598 | 0 | else |
1599 | 0 | sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; |
1600 | 0 | modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); |
1601 | 0 | COPY1_IF_LT(bcost, modeCosts[mode]); |
1602 | 0 | } |
1603 | 0 | } |
1604 | 0 | else |
1605 | 0 | { |
1606 | 0 | for (int mode = 2; mode < 35; mode++) |
1607 | 0 | { |
1608 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1609 | 0 | int filter = !!(g_intraFilterFlags[mode] & scaleTuSize); |
1610 | 0 | primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16); |
1611 | 0 | sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift; |
1612 | 0 | modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); |
1613 | 0 | COPY1_IF_LT(bcost, modeCosts[mode]); |
1614 | 0 | } |
1615 | 0 | } |
1616 | | |
1617 | | /* Find the top maxCandCount candidate modes with cost within 25% of best |
1618 | | * or among the most probable modes. maxCandCount is derived from the |
1619 | | * rdLevel and depth. In general we want to try more modes at slower RD |
1620 | | * levels and at higher depths */ |
1621 | 0 | for (int i = 0; i < maxCandCount; i++) |
1622 | 0 | candCostList[i] = MAX_INT64; |
1623 | |
|
1624 | 0 | uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25% |
1625 | 0 | for (int mode = 0; mode < 35; mode++) |
1626 | 0 | if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) |
1627 | | /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */ |
1628 | 0 | updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList); |
1629 | 0 | } |
1630 | | |
1631 | | /* measure best candidates using simple RDO (no TU splits) */ |
1632 | 0 | bcost = MAX_INT64; |
1633 | 0 | for (int i = 0; i < maxCandCount; i++) |
1634 | 0 | { |
1635 | 0 | if (candCostList[i] == MAX_INT64) |
1636 | 0 | break; |
1637 | | |
1638 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1639 | |
|
1640 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1641 | 0 | cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth); |
1642 | |
|
1643 | 0 | Cost icosts; |
1644 | 0 | if (checkTransformSkip) |
1645 | 0 | codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); |
1646 | 0 | else |
1647 | 0 | codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange); |
1648 | 0 | COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]); |
1649 | 0 | } |
1650 | 0 | } |
1651 | |
|
1652 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1653 | | |
1654 | | /* remeasure best mode, allowing TU splits */ |
1655 | 0 | cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth); |
1656 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1657 | |
|
1658 | 0 | Cost icosts; |
1659 | 0 | if (checkTransformSkip) |
1660 | 0 | codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); |
1661 | 0 | else |
1662 | 0 | codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange); |
1663 | 0 | totalDistortion += icosts.distortion; |
1664 | |
|
1665 | 0 | extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx); |
1666 | | |
1667 | | // set reconstruction for next intra prediction blocks |
1668 | 0 | if (puIdx != numPU - 1) |
1669 | 0 | { |
1670 | | /* This has important implications for parallelism and RDO. It is writing intermediate results into the |
1671 | | * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also |
1672 | | * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think |
1673 | | * that the contexts should be tracked through each PU */ |
1674 | 0 | PicYuv* reconPic = m_frame->m_reconPic; |
1675 | 0 | pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); |
1676 | 0 | uint32_t dststride = reconPic->m_stride; |
1677 | 0 | const pixel* src = reconYuv->getLumaAddr(absPartIdx); |
1678 | 0 | uint32_t srcstride = reconYuv->m_size; |
1679 | 0 | primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride); |
1680 | 0 | } |
1681 | 0 | } |
1682 | |
|
1683 | 0 | if (numPU > 1) |
1684 | 0 | { |
1685 | 0 | uint32_t combCbfY = 0; |
1686 | 0 | for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
1687 | 0 | combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1); |
1688 | |
|
1689 | 0 | cu.m_cbf[0][0] |= combCbfY; |
1690 | 0 | } |
1691 | | |
1692 | | // TODO: remove this |
1693 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1694 | |
|
1695 | 0 | return totalDistortion; |
1696 | 0 | } |
1697 | | |
1698 | | void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) |
1699 | 0 | { |
1700 | 0 | CUData& cu = intraMode.cu; |
1701 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1702 | 0 | Yuv* predYuv = &intraMode.predYuv; |
1703 | |
|
1704 | 0 | uint32_t bestMode = 0; |
1705 | 0 | uint64_t bestCost = MAX_INT64; |
1706 | 0 | uint32_t modeList[NUM_CHROMA_MODE]; |
1707 | |
|
1708 | 0 | uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift; |
1709 | 0 | uint32_t tuSize = 1 << log2TrSizeC; |
1710 | 0 | uint32_t tuDepth = 0; |
1711 | 0 | int32_t costShift = 0; |
1712 | |
|
1713 | 0 | if (tuSize > 32) |
1714 | 0 | { |
1715 | 0 | tuDepth = 1; |
1716 | 0 | costShift = 2; |
1717 | 0 | log2TrSizeC = 5; |
1718 | 0 | } |
1719 | |
|
1720 | 0 | IntraNeighbors intraNeighbors; |
1721 | 0 | initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors); |
1722 | 0 | cu.getAllowedChromaDir(0, modeList); |
1723 | | |
1724 | | // check chroma modes |
1725 | 0 | for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++) |
1726 | 0 | { |
1727 | 0 | uint32_t chromaPredMode = modeList[mode]; |
1728 | 0 | if (chromaPredMode == DM_CHROMA_IDX) |
1729 | 0 | chromaPredMode = cu.m_lumaIntraDir[0]; |
1730 | 0 | if (m_csp == X265_CSP_I422) |
1731 | 0 | chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; |
1732 | |
|
1733 | 0 | uint64_t cost = 0; |
1734 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
1735 | 0 | { |
1736 | 0 | const pixel* fenc = fencYuv->m_buf[chromaId]; |
1737 | 0 | pixel* pred = predYuv->m_buf[chromaId]; |
1738 | 0 | Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId); |
1739 | | // get prediction signal |
1740 | 0 | predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC); |
1741 | 0 | cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift; |
1742 | 0 | } |
1743 | |
|
1744 | 0 | if (cost < bestCost) |
1745 | 0 | { |
1746 | 0 | bestCost = cost; |
1747 | 0 | bestMode = modeList[mode]; |
1748 | 0 | } |
1749 | 0 | } |
1750 | |
|
1751 | 0 | cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth); |
1752 | 0 | } |
1753 | | |
1754 | | sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) |
1755 | 0 | { |
1756 | 0 | CUData& cu = intraMode.cu; |
1757 | 0 | Yuv& reconYuv = intraMode.reconYuv; |
1758 | |
|
1759 | 0 | uint32_t depth = cuGeom.depth; |
1760 | 0 | uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444; |
1761 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1762 | 0 | uint32_t absPartStep = cuGeom.numPartitions; |
1763 | 0 | sse_t totalDistortion = 0; |
1764 | |
|
1765 | 0 | int size = partitionFromLog2Size(log2TrSize); |
1766 | |
|
1767 | 0 | TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); |
1768 | |
|
1769 | 0 | do |
1770 | 0 | { |
1771 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
1772 | |
|
1773 | 0 | uint32_t bestMode = 0; |
1774 | 0 | sse_t bestDist = 0; |
1775 | 0 | uint64_t bestCost = MAX_INT64; |
1776 | | |
1777 | | // init mode list |
1778 | 0 | uint32_t minMode = 0; |
1779 | 0 | uint32_t maxMode = NUM_CHROMA_MODE; |
1780 | 0 | uint32_t modeList[NUM_CHROMA_MODE]; |
1781 | |
|
1782 | 0 | if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth) |
1783 | 0 | { |
1784 | 0 | for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++) |
1785 | 0 | modeList[l] = intraMode.cu.m_chromaIntraDir[0]; |
1786 | 0 | maxMode = 1; |
1787 | 0 | } |
1788 | 0 | else |
1789 | 0 | cu.getAllowedChromaDir(absPartIdxC, modeList); |
1790 | |
|
1791 | 0 | if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) |
1792 | 0 | { |
1793 | 0 | for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++) |
1794 | 0 | modeList[l] = modeList[0]; |
1795 | 0 | maxMode = 1; |
1796 | 0 | } |
1797 | | // check chroma modes |
1798 | 0 | for (uint32_t mode = minMode; mode < maxMode; mode++) |
1799 | 0 | { |
1800 | | // restore context models |
1801 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1802 | |
|
1803 | 0 | cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth); |
1804 | 0 | Cost outCost; |
1805 | 0 | codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost); |
1806 | |
|
1807 | 0 | if (m_slice->m_pps->bTransformSkipEnabled) |
1808 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1809 | |
|
1810 | 0 | m_entropyCoder.resetBits(); |
1811 | | // chroma prediction mode |
1812 | 0 | if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444) |
1813 | 0 | { |
1814 | 0 | if (!absPartIdxC) |
1815 | 0 | m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); |
1816 | 0 | } |
1817 | 0 | else |
1818 | 0 | { |
1819 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
1820 | 0 | if (!(absPartIdxC & (qNumParts - 1))) |
1821 | 0 | m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); |
1822 | 0 | } |
1823 | |
|
1824 | 0 | codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC); |
1825 | 0 | codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U); |
1826 | 0 | codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V); |
1827 | 0 | uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); |
1828 | 0 | uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy) |
1829 | 0 | : m_rdCost.calcRdCost(outCost.distortion, bits); |
1830 | |
|
1831 | 0 | if (cost < bestCost) |
1832 | 0 | { |
1833 | 0 | bestCost = cost; |
1834 | 0 | bestDist = outCost.distortion; |
1835 | 0 | bestMode = modeList[mode]; |
1836 | 0 | extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth); |
1837 | 0 | memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1838 | 0 | memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1839 | 0 | memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1840 | 0 | memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1841 | 0 | } |
1842 | 0 | } |
1843 | |
|
1844 | 0 | if (!tuIterator.isLastSection()) |
1845 | 0 | { |
1846 | 0 | uint32_t zorder = cuGeom.absPartIdx + absPartIdxC; |
1847 | 0 | PicYuv* reconPic = m_frame->m_reconPic; |
1848 | 0 | uint32_t dststride = reconPic->m_strideC; |
1849 | 0 | const pixel* src; |
1850 | 0 | pixel* dst; |
1851 | |
|
1852 | 0 | dst = reconPic->getCbAddr(cu.m_cuAddr, zorder); |
1853 | 0 | src = reconYuv.getCbAddr(absPartIdxC); |
1854 | 0 | primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize); |
1855 | |
|
1856 | 0 | dst = reconPic->getCrAddr(cu.m_cuAddr, zorder); |
1857 | 0 | src = reconYuv.getCrAddr(absPartIdxC); |
1858 | 0 | primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize); |
1859 | 0 | } |
1860 | |
|
1861 | 0 | memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1862 | 0 | memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1863 | 0 | memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1864 | 0 | memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1865 | 0 | cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth); |
1866 | 0 | totalDistortion += bestDist; |
1867 | 0 | } |
1868 | 0 | while (tuIterator.isNextSection()); |
1869 | |
|
1870 | 0 | if (initTuDepth != 0) |
1871 | 0 | { |
1872 | 0 | uint32_t combCbfU = 0; |
1873 | 0 | uint32_t combCbfV = 0; |
1874 | 0 | uint32_t qNumParts = tuIterator.absPartIdxStep; |
1875 | 0 | for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
1876 | 0 | { |
1877 | 0 | combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1); |
1878 | 0 | combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1); |
1879 | 0 | } |
1880 | |
|
1881 | 0 | cu.m_cbf[1][0] |= combCbfU; |
1882 | 0 | cu.m_cbf[2][0] |= combCbfV; |
1883 | 0 | } |
1884 | | |
1885 | | /* TODO: remove this */ |
1886 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1887 | 0 | return totalDistortion; |
1888 | 0 | } |
1889 | | |
1890 | | /* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */ |
1891 | | uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m) |
1892 | 0 | { |
1893 | 0 | X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n"); |
1894 | |
|
1895 | 0 | MVField candMvField[MRG_MAX_NUM_CANDS][2]; |
1896 | 0 | uint8_t candDir[MRG_MAX_NUM_CANDS]; |
1897 | 0 | uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir); |
1898 | |
|
1899 | 0 | if (cu.isBipredRestriction()) |
1900 | 0 | { |
1901 | | /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */ |
1902 | 0 | for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) |
1903 | 0 | { |
1904 | 0 | if (candDir[mergeCand] == 3) |
1905 | 0 | { |
1906 | 0 | candDir[mergeCand] = 1; |
1907 | 0 | candMvField[mergeCand][1].refIdx = REF_NOT_VALID; |
1908 | 0 | } |
1909 | 0 | } |
1910 | 0 | } |
1911 | |
|
1912 | 0 | Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv; |
1913 | |
|
1914 | 0 | uint32_t outCost = MAX_UINT; |
1915 | 0 | for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) |
1916 | 0 | { |
1917 | | /* Prevent TMVP candidates from using unavailable reference pixels */ |
1918 | 0 | if (m_bFrameParallel) |
1919 | 0 | { |
1920 | | // Parallel slices bound check |
1921 | 0 | if (m_param->maxSlices > 1) |
1922 | 0 | { |
1923 | 0 | if (cu.m_bFirstRowInSlice & |
1924 | 0 | ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4)))) |
1925 | 0 | continue; |
1926 | | |
1927 | | // Last row in slice can't reference beyond bound since it is another slice area |
1928 | | // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance |
1929 | 0 | if (cu.m_bLastRowInSlice && |
1930 | 0 | ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4))) |
1931 | 0 | continue; |
1932 | 0 | } |
1933 | | |
1934 | 0 | if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 || |
1935 | 0 | candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4) |
1936 | 0 | continue; |
1937 | 0 | } |
1938 | | |
1939 | 0 | cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv; |
1940 | 0 | cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx; |
1941 | 0 | cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv; |
1942 | 0 | cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx; |
1943 | |
|
1944 | 0 | motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD); |
1945 | |
|
1946 | 0 | uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size); |
1947 | 0 | if (m_me.bChromaSATD) |
1948 | 0 | costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx); |
1949 | |
|
1950 | 0 | uint32_t bitsCand = getTUBits(mergeCand, numMergeCand); |
1951 | 0 | costCand = costCand + m_rdCost.getCost(bitsCand); |
1952 | 0 | if (costCand < outCost) |
1953 | 0 | { |
1954 | 0 | outCost = costCand; |
1955 | 0 | m.bits = bitsCand; |
1956 | 0 | m.index = mergeCand; |
1957 | 0 | } |
1958 | 0 | } |
1959 | |
|
1960 | 0 | m.mvField[0] = candMvField[m.index][0]; |
1961 | 0 | m.mvField[1] = candMvField[m.index][1]; |
1962 | 0 | m.dir = candDir[m.index]; |
1963 | |
|
1964 | 0 | return outCost; |
1965 | 0 | } |
1966 | | |
1967 | | /* find the lowres motion vector from lookahead in middle of current PU */ |
1968 | | MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref) |
1969 | 0 | { |
1970 | 0 | int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]); |
1971 | 0 | if (diffPoc > m_param->bframes + 1) |
1972 | | /* poc difference is out of range for lookahead */ |
1973 | 0 | return 0; |
1974 | | |
1975 | 0 | MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc]; |
1976 | 0 | if (mvs[0].x == 0x7FFF) |
1977 | | /* this motion search was not estimated by lookahead */ |
1978 | 0 | return 0; |
1979 | | |
1980 | 0 | uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4; |
1981 | 0 | uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4; |
1982 | 0 | uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x; |
1983 | |
|
1984 | 0 | X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n"); |
1985 | 0 | X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n"); |
1986 | |
|
1987 | 0 | return mvs[idx] << 1; /* scale up lowres mv */ |
1988 | 0 | } |
1989 | | |
1990 | | /* Pick between the two AMVP candidates which is the best one to use as |
1991 | | * MVP for the motion search, based on SAD cost */ |
1992 | | int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref) |
1993 | 0 | { |
1994 | 0 | if (amvp[0] == amvp[1]) |
1995 | 0 | return 0; |
1996 | | |
1997 | 0 | Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv; |
1998 | 0 | uint32_t costs[AMVP_NUM_CANDS]; |
1999 | |
|
2000 | 0 | for (int i = 0; i < AMVP_NUM_CANDS; i++) |
2001 | 0 | { |
2002 | 0 | MV mvCand = amvp[i]; |
2003 | | |
2004 | | // NOTE: skip mvCand if Y is > merange and -FN>1 |
2005 | 0 | if (m_bFrameParallel) |
2006 | 0 | { |
2007 | 0 | costs[i] = m_me.COST_MAX; |
2008 | |
|
2009 | 0 | if (mvCand.y >= (m_param->searchRange + 1) * 4) |
2010 | 0 | continue; |
2011 | | |
2012 | 0 | if ((m_param->maxSlices > 1) & |
2013 | 0 | ((mvCand.y < m_sliceMinY) |
2014 | 0 | | (mvCand.y > m_sliceMaxY))) |
2015 | 0 | continue; |
2016 | 0 | } |
2017 | 0 | cu.clipMv(mvCand); |
2018 | 0 | predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand); |
2019 | 0 | costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); |
2020 | 0 | } |
2021 | |
|
2022 | 0 | return (costs[0] <= costs[1]) ? 0 : 1; |
2023 | 0 | } |
2024 | | |
2025 | | void Search::PME::processTasks(int workerThreadId) |
2026 | 0 | { |
2027 | | #if DETAILED_CU_STATS |
2028 | | int fe = mode.cu.m_encData->m_frameEncoderID; |
2029 | | master.m_stats[fe].countPMETasks++; |
2030 | | ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime); |
2031 | | #endif |
2032 | 0 | ProfileScopeEvent(pme); |
2033 | 0 | master.processPME(*this, master.m_tld[workerThreadId].analysis); |
2034 | 0 | } |
2035 | | |
/* Drain distributed motion-estimation jobs from a PME job group. Job ids index
 * L0 references first (0 .. refCnt[0]-1), then L1 references. `slave` is the
 * Search instance doing the work; when it is not `this` (a bonded worker
 * thread), it must first be configured to mirror the master's CU context. */
void Search::processPME(PME& pme, Search& slave)
{
    /* acquire a motion estimation job, else exit early */
    int meId;
    pme.m_lock.acquire();
    if (pme.m_jobTotal > pme.m_jobAcquired)
    {
        meId = pme.m_jobAcquired++;
        pme.m_lock.release();
    }
    else
    {
        pme.m_lock.release();
        return;
    }

    /* Setup slave Search instance for ME for master's CU */
    if (&slave != this)
    {
        slave.m_slice = m_slice;
        slave.m_frame = m_frame;
        slave.m_param = m_param;
        slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
    }

    /* Perform ME, repeat until no more work is available */
    do
    {
        if (meId < pme.m_jobs.refCnt[0])
        {
            int refIdx = pme.m_jobs.ref[0][meId]; //L0
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx);
        }
        else
        {
            /* job ids past the L0 count map into the L1 reference table */
            int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1
            slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx);
        }

        /* try to grab another job; -1 means the pool is exhausted */
        meId = -1;
        pme.m_lock.acquire();
        if (pme.m_jobTotal > pme.m_jobAcquired)
            meId = pme.m_jobAcquired++;
        pme.m_lock.release();
    }
    while (meId >= 0);
}
2085 | | |
/* Perform motion estimation for one (list, ref) pair of a PU and, under the
 * master's lock, update interMode.bestME[part][list] if this reference yields
 * a lower total cost. Designed to run on worker threads (see processPME), so
 * all shared-state updates happen inside master.m_meLock. */
void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref)
{
    /* bit cost of signalling this list, the MVP index, and the reference index */
    uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
    bits += getTUBits(ref, m_slice->m_numRefIdx[list]);

    MotionData* bestME = interMode.bestME[part];

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

    const MV* amvp = interMode.amvpCand[list][ref];
    int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
    bool bLowresMVP = false;
    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;

    if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
    {
        /* seed the candidate list (and optionally the HME predictor) with the
         * lookahead's lowres motion vector for this PU */
        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
        if (lmv.notZero())
            mvc[numMvc++] = lmv;
        if (m_param->bEnableHME)
            mvp_lowres = lmv;
    }

    setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);

    int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

    /* with HME enabled, retry the search centered on the lowres predictor and
     * keep whichever start point produced the cheaper result */
    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
    {
        MV outmv_lowres;
        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
        if (lowresMvCost < satdCost)
        {
            outmv = outmv_lowres;
            satdCost = lowresMvCost;
            bLowresMVP = true;
        }
    }
    /* Get total cost of partition, but only include MV bit cost once */
    bits += m_me.bitcost(outmv);
    uint32_t mvCost = m_me.mvcost(outmv);
    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);

    /* Update LowresMVP to best AMVP cand*/
    if (bLowresMVP)
        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);

    /* Refine MVP selection, updates: mvpIdx, bits, cost */
    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

    /* tie goes to the smallest ref ID, just like --no-pme */
    ScopedLock _lock(master.m_meLock);
    if (cost < bestME[list].cost ||
       (cost == bestME[list].cost && ref < bestME[list].ref))
    {
        bestME[list].mv = outmv;
        bestME[list].mvp = mvp;
        bestME[list].mvpIdx = mvpIdx;
        bestME[list].ref = ref;
        bestME[list].cost = cost;
        bestME[list].bits = bits;
        bestME[list].mvCost = mvCost;
    }
}
/* Run motion estimation from up to m_param->mvRefine starting predictors in
 * mvp[] and write the MV with the lowest SATD cost to outmv. Duplicate
 * predictors are skipped so each distinct start point is searched only once.
 * Note: outmv is only written when at least one candidate is searched. */
void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
{
    CUData& cu = interMode.cu;
    MV mv, mvmin, mvmax;
    int cand = 0, bestcost = INT_MAX;
    while (cand < m_param->mvRefine)
    {
        /* skip a predictor identical to one already searched (candidate 2 is
         * checked against both earlier entries) */
        if ((cand && mvp[cand] == mvp[cand - 1]) || (cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1])))
        {
            cand++;
            continue;
        }
        MV bestMV;
        mv = mvp[cand++];
        cu.clipMv(mv);
        setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
        int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices,
          m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
        if (bestcost > cost)
        {
            bestcost = cost;
            outmv = bestMV;
        }
    }
}
/* find the best inter prediction for each PU of specified mode.
 *
 * For every PU of interMode.cu this evaluates, in order: merge candidates
 * (for multi-PU modes), uni-directional ME over each available reference in
 * each list (possibly distributed across worker threads, or reused from saved
 * analysis data), and bi-directional prediction (including a zero-MV probe).
 * The winning mode is written into the CU fields and interMode.sa8dBits is
 * incremented by the total motion bits spent. refMasks[] restricts which
 * references are searched per PU (L1 refs occupy bits 16+). */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);

    CUData& cu = interMode.cu;
    Yuv* predYuv = &interMode.predYuv;

    // 12 mv candidates including lowresMV
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];

    const Slice *slice = m_slice;
    int numPart = cu.getNumPartInter(0);
    int numPredDir = slice->isInterP() ? 1 : 2;
    const int* numRefIdx = slice->m_numRefIdx;
    uint32_t lastMode = 0;
    int totalmebits = 0;
    MV mvzero(0, 0);
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
    MergeData merge;
    memset(&merge, 0, sizeof(merge));
    bool useAsMVP = false;
    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        MotionData* bestME = interMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
        useAsMVP = false;
        x265_analysis_inter_data* interDataCTU = NULL;
        int cuIdx;
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
        /* at the highest analysis-reuse level, the saved MV for this PU may be
         * usable directly as an MVP if the saved mode/partition/depth match */
        if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1)
        {
            interDataCTU = m_frame->m_analysisData.interData;
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
                useAsMVP = true;
        }
        /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
        bestME[0].cost = MAX_UINT;
        bestME[1].cost = MAX_UINT;

        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
        bool bDoUnidir = true;

        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
        /* Uni-directional prediction */
        if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
        {
            /* reuse path: start from previously-determined references / MVPs
             * rather than searching every reference */
            for (int list = 0; list < numPredDir; list++)
            {

                int ref = -1;
                if (useAsMVP)
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
                else
                    ref = bestME[list].ref;
                if (ref < 0)
                {
                    continue;
                }
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                bits += getTUBits(ref, numRefIdx[list]);

                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
                const MV* amvp = interMode.amvpCand[list][ref];
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                MV mvmin, mvmax, outmv, mvp;
                if (useAsMVP)
                {
                    /* saved MV from the prior analysis pass acts as the predictor */
                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
                }
                else
                    mvp = amvp[mvpIdx];
                if (m_param->searchMethod == X265_SEA)
                {
                    /* SEA needs the integral planes offset to this PU's position */
                    int puX = puIdx & 1;
                    int puY = puIdx >> 1;
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                }
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                MV mvpIn = mvp;
                int satdCost;
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
                    mvpIn = bestME[list].mv;
                if (useAsMVP && m_param->mvRefine > 1)
                {
                    /* try up to three distinct MVP start points (saved MV, best
                     * AMVP cand, alternate AMVP cand) and keep the cheapest */
                    MV bestmv, mvpSel[3];
                    int mvpIdxSel[3];
                    satdCost = m_me.COST_MAX;
                    mvpSel[0] = mvp;
                    mvpIdxSel[0] = mvpIdx;
                    mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                    mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
                    mvpIdxSel[1] = mvpIdx;
                    if (m_param->mvRefine > 2)
                    {
                        mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
                        mvpIdxSel[2] = !mvpIdx;
                    }
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
                    {
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
                            continue;
                        setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (satdCost > bcost)
                        {
                            satdCost = bcost;
                            outmv = bestmv;
                            mvp = mvpSel[cand];
                            mvpIdx = mvpIdxSel[cand];
                        }
                    }
                    mvpIn = mvp;
                }
                else
                {
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                }

                /* Get total cost of partition, but only include MV bit cost once */
                bits += m_me.bitcost(outmv);
                uint32_t mvCost = m_me.mvcost(outmv);
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                /* Refine MVP selection, updates: mvpIdx, bits, cost */
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
                else
                {
                    /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here
                       the actual mvp is bestME from pass 1 for that mvpIdx */
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
                    if (diffBits < 0)
                    {
                        mvpIdx = !mvpIdx;
                        uint32_t origOutBits = bits;
                        bits = origOutBits + diffBits;
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
                    }
                    mvp = amvp[mvpIdx];
                }

                if (cost < bestME[list].cost)
                {
                    bestME[list].mv = outmv;
                    bestME[list].mvp = mvp;
                    bestME[list].mvpIdx = mvpIdx;
                    bestME[list].cost = cost;
                    bestME[list].bits = bits;
                    bestME[list].mvCost = mvCost;
                    bestME[list].ref = ref;
                }
                bDoUnidir = false;
            }
        }
        else if (m_param->bDistributeMotionEstimation)
        {
            /* farm out per-reference ME jobs to bonded worker threads */
            PME pme(*this, interMode, cuGeom, pu, puIdx);
            pme.m_jobTotal = 0;
            pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */

            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
            for (int list = 0; list < numPredDir; list++)
            {
                int idx = 0;
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    if (!(refMask & (1 << ref)))
                        continue;

                    pme.m_jobs.ref[list][idx++] = ref;
                    pme.m_jobTotal++;
                }
                pme.m_jobs.refCnt[list] = idx;

                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }

            if (pme.m_jobTotal > 2)
            {
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);

                processPME(pme, *this);

                /* the reserved first job (L0-0 or L1-0) is done on this thread */
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */

                bDoUnidir = false;

                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
                pme.waitForExit();
            }

            /* if no peer threads were bonded, fall back to doing unidirectional
             * searches ourselves without overhead of singleMotionEstimation() */
        }
        if (bDoUnidir)
        {
            /* serial uni-directional search over every allowed reference */
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;

            for (int list = 0; list < numPredDir; list++)
            {
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);

                    if (!(refMask & (1 << ref)))
                    {
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
                        continue;
                    }

                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                    bits += getTUBits(ref, numRefIdx[list]);

                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);

                    const MV* amvp = interMode.amvpCand[list][ref];
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
                    bool bLowresMVP = false;

                    if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
                    {
                        MV lmv = getLowresMV(cu, pu, list, ref);
                        if (lmv.notZero())
                            mvc[numMvc++] = lmv;
                        if (m_param->bEnableHME)
                            mvp_lowres = lmv;
                    }
                    if (m_param->searchMethod == X265_SEA)
                    {
                        /* SEA needs the integral planes offset to this PU's position */
                        int puX = puIdx & 1;
                        int puY = puIdx >> 1;
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                    }
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                      m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);

                    /* with HME, retry the search from the lowres predictor and
                     * keep the cheaper of the two results */
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
                    {
                        MV outmv_lowres;
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (lowresMvCost < satdCost)
                        {
                            outmv = outmv_lowres;
                            satdCost = lowresMvCost;
                            bLowresMVP = true;
                        }
                    }

                    /* Get total cost of partition, but only include MV bit cost once */
                    bits += m_me.bitcost(outmv);
                    uint32_t mvCost = m_me.mvcost(outmv);
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                    /* Update LowresMVP to best AMVP cand*/
                    if (bLowresMVP)
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);

                    /* Refine MVP selection, updates: mvpIdx, bits, cost */
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);

                    if (cost < bestME[list].cost)
                    {
                        bestME[list].mv = outmv;
                        bestME[list].mvp = mvp;
                        bestME[list].mvpIdx = mvpIdx;
                        bestME[list].ref = ref;
                        bestME[list].cost = cost;
                        bestME[list].bits = bits;
                        bestME[list].mvCost = mvCost;
                    }
                }
                /* the second list ref bits start at bit 16 */
                refMask >>= 16;
            }
        }

        /* Bi-directional prediction */
        MotionData bidir[2];
        uint32_t bidirCost = MAX_UINT;
        int bidirBits = 0;

        if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
        {
            bidir[0] = bestME[0];
            bidir[1] = bestME[1];

            int satdCost;

            if (m_me.bChromaSATD)
            {
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                motionCompensation(cu, pu, tmpPredYuv, true, true);

                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
            }
            else
            {
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;

                /* Generate reference subpels */
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
                                                                                                                                                          bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            }

            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);

            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
            if (bTryZero)
            {
                /* Do not try zero MV if unidir motion predictors are beyond
                 * valid search area */
                MV mvmin, mvmax;
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
                mvmax.y += 2; // there is some pad for subpel refine
                mvmin <<= 2;
                mvmax <<= 2;

                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
            }
            if (bTryZero)
            {
                /* coincident blocks of the two reference pictures */
                if (m_me.bChromaSATD)
                {
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                    motionCompensation(cu, pu, tmpPredYuv, true, true);

                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                }
                else
                {
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
                }
                MV mvp0 = bestME[0].mvp;
                int mvpIdx0 = bestME[0].mvpIdx;
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);

                MV mvp1 = bestME[1].mvp;
                int mvpIdx1 = bestME[1].mvpIdx;
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);

                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

                /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);

                if (cost < bidirCost)
                {
                    bidir[0].mv = mvzero;
                    bidir[1].mv = mvzero;
                    bidir[0].mvp = mvp0;
                    bidir[1].mvp = mvp1;
                    bidir[0].mvpIdx = mvpIdx0;
                    bidir[1].mvpIdx = mvpIdx1;
                    bidirCost = cost;
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
                }
            }
        }

        /* select best option and store into CU */
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
        {
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);

            totalmebits += merge.bits;
        }
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
        {
            /* bi-prediction wins: lastMode 2 feeds the next PU's getBlkBits() */
            lastMode = 2;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;

            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;

            totalmebits += bidirBits;
        }
        else if (bestME[0].cost <= bestME[1].cost)
        {
            /* L0 uni-prediction wins */
            lastMode = 0;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;

            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[0].bits;
        }
        else
        {
            /* L1 uni-prediction wins */
            lastMode = 1;

            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;

            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);

            totalmebits += bestME[1].bits;
        }

        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
    }
    interMode.sa8dBits += totalmebits;
}
2648 | | |
2649 | | void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]) |
2650 | 0 | { |
2651 | 0 | if (cuMode == SIZE_2Nx2N) |
2652 | 0 | { |
2653 | 0 | blockBit[0] = (!bPSlice) ? 3 : 1; |
2654 | 0 | blockBit[1] = 3; |
2655 | 0 | blockBit[2] = 5; |
2656 | 0 | } |
2657 | 0 | else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD) |
2658 | 0 | { |
2659 | 0 | static const uint32_t listBits[2][3][3] = |
2660 | 0 | { |
2661 | 0 | { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, |
2662 | 0 | { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } } |
2663 | 0 | }; |
2664 | 0 | if (bPSlice) |
2665 | 0 | { |
2666 | 0 | blockBit[0] = 3; |
2667 | 0 | blockBit[1] = 0; |
2668 | 0 | blockBit[2] = 0; |
2669 | 0 | } |
2670 | 0 | else |
2671 | 0 | memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); |
2672 | 0 | } |
2673 | 0 | else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N) |
2674 | 0 | { |
2675 | 0 | static const uint32_t listBits[2][3][3] = |
2676 | 0 | { |
2677 | 0 | { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, |
2678 | 0 | { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } } |
2679 | 0 | }; |
2680 | 0 | if (bPSlice) |
2681 | 0 | { |
2682 | 0 | blockBit[0] = 3; |
2683 | 0 | blockBit[1] = 0; |
2684 | 0 | blockBit[2] = 0; |
2685 | 0 | } |
2686 | 0 | else |
2687 | 0 | memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); |
2688 | 0 | } |
2689 | 0 | else if (cuMode == SIZE_NxN) |
2690 | 0 | { |
2691 | 0 | blockBit[0] = (!bPSlice) ? 3 : 1; |
2692 | 0 | blockBit[1] = 3; |
2693 | 0 | blockBit[2] = 5; |
2694 | 0 | } |
2695 | 0 | else |
2696 | 0 | { |
2697 | 0 | X265_CHECK(0, "getBlkBits: unknown cuMode\n"); |
2698 | 0 | } |
2699 | 0 | } |
2700 | | |
2701 | | /* Check if using an alternative MVP would result in a smaller MVD + signal bits */ |
2702 | | const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const |
2703 | 0 | { |
2704 | 0 | int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]); |
2705 | 0 | if (diffBits < 0) |
2706 | 0 | { |
2707 | 0 | mvpIdx = !mvpIdx; |
2708 | 0 | uint32_t origOutBits = outBits; |
2709 | 0 | outBits = origOutBits + diffBits; |
2710 | 0 | outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits); |
2711 | 0 | } |
2712 | 0 | return amvpCand[mvpIdx]; |
2713 | 0 | } |
2714 | | |
2715 | | /* Update to default MVP when using an alternative mvp */ |
2716 | | void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP) |
2717 | 0 | { |
2718 | 0 | int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP); |
2719 | 0 | uint32_t origOutBits = outBits; |
2720 | 0 | outBits = origOutBits + diffBits; |
2721 | 0 | outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits); |
2722 | 0 | } |
2723 | | |
/* Compute the [mvmin, mvmax] motion search window (in full-pel units on
 * return) centered on mvp with radius `merange`, then clamp it successively
 * for: picture boundaries, intra-refresh safety (P slices), slice bounds
 * under frame parallelism, the signaled maximum MV length, and reference
 * frame reconstruction lag. */
void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
{
    /* merange is in full-pel; MV units here are quarter-pel */
    MV dist((int32_t)merange << 2, (int32_t)merange << 2);
    mvmin = mvp - dist;
    mvmax = mvp + dist;

    cu.clipMv(mvmin);
    cu.clipMv(mvmax);

    /* intra-refresh: a P-slice CU left of the refresh column must not reference
     * pixels right of the previous frame's refreshed region */
    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
        cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
        m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
    {
        int safeX, maxSafeMv;
        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;
        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
    }

    // apply restrict on slices
    if ((m_param->maxSlices > 1) & m_bFrameParallel)
    {
        mvmin.y = X265_MAX(mvmin.y, m_sliceMinY);
        mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY);
    }

    /* Clip search range to signaled maximum MV length.
     * We do not support this VUI field being changed from the default */
    const int maxMvLen = (1 << 15) - 1;
    mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
    mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
    mvmax.x = X265_MIN(mvmax.x, maxMvLen);
    mvmax.y = X265_MIN(mvmax.y, maxMvLen);

    /* convert quarter-pel window to full-pel for the integer search */
    mvmin >>= 2;
    mvmax >>= 2;

    /* conditional clipping for frame parallelism */
    mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels);
    mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels);

    /* conditional clipping for negative mv range */
    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
}
2769 | | |
/* Evaluate the CU as SKIP (merge with no residual): the reconstruction is the
 * prediction itself.  Computes distortion, signal bits (skip flag + merge
 * index), the active energy metric and the final mode cost.
 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
{
    CUData& cu = interMode.cu;
    Yuv* reconYuv = &interMode.reconYuv;
    const Yuv* fencYuv = interMode.fencYuv;
    Yuv* predYuv = &interMode.predYuv;
    X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
    uint32_t depth = cu.m_cuDepth[0];

    // No residual coding : SKIP mode
    cu.setPredModeSubParts(MODE_SKIP);
    cu.clearCbf();
    cu.setTUDepthSubParts(0, 0, depth);

    reconYuv->copyFromYuv(interMode.predYuv); // recon == prediction when no residual

    // Luma
    int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
    interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    interMode.distortion = interMode.lumaDistortion;
    // Chroma (skipped entirely for monochrome encodes/sources)
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
        interMode.distortion += interMode.chromaDistortion;
    }
    cu.m_distortion[0] = interMode.distortion;
    /* count signal bits for a skip CU: optional TQ-bypass flag, skip flag,
     * then the merge index (attributed to mvBits) */
    m_entropyCoder.load(m_rqt[depth].cur);
    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
    m_entropyCoder.codeMergeIndex(cu, 0);
    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
    interMode.coeffBits = 0;
    interMode.totalBits = interMode.mvBits + skipFlagBits;
    /* energy term feeding the psy-rd or ssim-rd cost model, if enabled */
    if (m_rdCost.m_psyRd)
        interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
    else if(m_rdCost.m_ssimRd)
        interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);

    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
    updateModeCost(interMode);
    m_entropyCoder.store(interMode.contexts);
}
2819 | | |
2820 | | /* encode residual and calculate rate-distortion for a CU block. |
2821 | | * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ |
2822 | | void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) |
2823 | 0 | { |
2824 | 0 | ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); |
2825 | |
|
2826 | 0 | CUData& cu = interMode.cu; |
2827 | 0 | Yuv* reconYuv = &interMode.reconYuv; |
2828 | 0 | Yuv* predYuv = &interMode.predYuv; |
2829 | 0 | uint32_t depth = cuGeom.depth; |
2830 | 0 | ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv; |
2831 | 0 | const Yuv* fencYuv = interMode.fencYuv; |
2832 | |
|
2833 | 0 | X265_CHECK(!cu.isIntra(0), "intra CU not expected\n"); |
2834 | |
|
2835 | 0 | uint32_t log2CUSize = cuGeom.log2CUSize; |
2836 | 0 | int sizeIdx = log2CUSize - 2; |
2837 | |
|
2838 | 0 | resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp); |
2839 | |
|
2840 | 0 | uint32_t tuDepthRange[2]; |
2841 | 0 | cu.getInterTUQtDepthRange(tuDepthRange, 0); |
2842 | |
|
2843 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
2844 | |
|
2845 | 0 | if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH)) |
2846 | 0 | m_maxTUDepth = -1; |
2847 | 0 | else if (m_limitTU & X265_TU_LIMIT_BFS) |
2848 | 0 | memset(&m_cacheTU, 0, sizeof(TUInfoCache)); |
2849 | |
|
2850 | 0 | Cost costs; |
2851 | 0 | if (m_limitTU & X265_TU_LIMIT_NEIGH) |
2852 | 0 | { |
2853 | | /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */ |
2854 | 0 | int32_t tempDepth = m_maxTUDepth; |
2855 | 0 | if (m_maxTUDepth != -1) |
2856 | 0 | { |
2857 | 0 | uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N; |
2858 | 0 | uint32_t minSize = tuDepthRange[0]; |
2859 | 0 | uint32_t maxSize = tuDepthRange[1]; |
2860 | 0 | maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag); |
2861 | 0 | m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth); |
2862 | 0 | } |
2863 | 0 | estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange); |
2864 | 0 | m_maxTUDepth = tempDepth; |
2865 | 0 | } |
2866 | 0 | else |
2867 | 0 | estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange); |
2868 | |
|
2869 | 0 | uint32_t tqBypass = cu.m_tqBypass[0]; |
2870 | 0 | if (!tqBypass) |
2871 | 0 | { |
2872 | 0 | sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
2873 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
2874 | 0 | { |
2875 | 0 | cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); |
2876 | 0 | cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); |
2877 | 0 | } |
2878 | | |
2879 | | /* Consider the RD cost of not signaling any residual */ |
2880 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
2881 | 0 | m_entropyCoder.resetBits(); |
2882 | 0 | m_entropyCoder.codeQtRootCbfZero(); |
2883 | 0 | uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits(); |
2884 | |
|
2885 | 0 | uint32_t cbf0Energy; uint64_t cbf0Cost; |
2886 | 0 | if (m_rdCost.m_psyRd) |
2887 | 0 | { |
2888 | 0 | cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
2889 | 0 | cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy); |
2890 | 0 | } |
2891 | 0 | else if(m_rdCost.m_ssimRd) |
2892 | 0 | { |
2893 | 0 | cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0); |
2894 | 0 | cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy); |
2895 | 0 | } |
2896 | 0 | else |
2897 | 0 | cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits); |
2898 | |
|
2899 | 0 | if (cbf0Cost < costs.rdcost) |
2900 | 0 | { |
2901 | 0 | cu.clearCbf(); |
2902 | 0 | cu.setTUDepthSubParts(0, 0, depth); |
2903 | 0 | } |
2904 | 0 | } |
2905 | |
|
2906 | 0 | if (cu.getQtRootCbf(0)) |
2907 | 0 | saveResidualQTData(cu, *resiYuv, 0, 0); |
2908 | | |
2909 | | /* calculate signal bits for inter/merge/skip coded CU */ |
2910 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
2911 | |
|
2912 | 0 | m_entropyCoder.resetBits(); |
2913 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
2914 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(tqBypass); |
2915 | |
|
2916 | 0 | uint32_t coeffBits, bits, mvBits; |
2917 | 0 | if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) |
2918 | 0 | { |
2919 | 0 | cu.setPredModeSubParts(MODE_SKIP); |
2920 | | |
2921 | | /* Merge/Skip */ |
2922 | 0 | coeffBits = mvBits = 0; |
2923 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
2924 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
2925 | 0 | m_entropyCoder.codeMergeIndex(cu, 0); |
2926 | 0 | mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
2927 | 0 | bits = mvBits + skipFlagBits; |
2928 | 0 | } |
2929 | 0 | else |
2930 | 0 | { |
2931 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
2932 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
2933 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
2934 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
2935 | 0 | m_entropyCoder.codePredInfo(cu, 0); |
2936 | 0 | mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
2937 | |
|
2938 | 0 | bool bCodeDQP = m_slice->m_pps->bUseDQP; |
2939 | 0 | m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); |
2940 | 0 | bits = m_entropyCoder.getNumberOfWrittenBits(); |
2941 | |
|
2942 | 0 | coeffBits = bits - mvBits - skipFlagBits; |
2943 | 0 | } |
2944 | |
|
2945 | 0 | m_entropyCoder.store(interMode.contexts); |
2946 | |
|
2947 | 0 | if (cu.getQtRootCbf(0)) |
2948 | 0 | reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp); |
2949 | 0 | else |
2950 | 0 | reconYuv->copyFromYuv(*predYuv); |
2951 | | |
2952 | | // update with clipped distortion and cost (qp estimation loop uses unclipped values) |
2953 | 0 | sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
2954 | 0 | interMode.distortion = bestLumaDist; |
2955 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
2956 | 0 | { |
2957 | 0 | sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); |
2958 | 0 | bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); |
2959 | 0 | interMode.chromaDistortion = bestChromaDist; |
2960 | 0 | interMode.distortion += bestChromaDist; |
2961 | 0 | } |
2962 | 0 | if (m_rdCost.m_psyRd) |
2963 | 0 | interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
2964 | 0 | else if(m_rdCost.m_ssimRd) |
2965 | 0 | interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0); |
2966 | |
|
2967 | 0 | interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
2968 | 0 | interMode.totalBits = bits; |
2969 | 0 | interMode.lumaDistortion = bestLumaDist; |
2970 | 0 | interMode.coeffBits = coeffBits; |
2971 | 0 | interMode.mvBits = mvBits; |
2972 | 0 | cu.m_distortion[0] = interMode.distortion; |
2973 | 0 | updateModeCost(interMode); |
2974 | 0 | checkDQP(interMode, cuGeom); |
2975 | 0 | } |
2976 | | |
/* Non-RD residual coding path: transform/quantize the inter residual for this
 * TU, dequantize the coded residual back into the shared tmpResiYuv, and set
 * CBFs for all components.  Recurses into four child TUs when this TU is
 * larger than the allowed full-TU size. */
void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2])
{
    uint32_t depth = cuGeom.depth + tuDepth;
    CUData& cu = mode.cu;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    bool bCheckFull = log2TrSize <= depthRange[1];
    /* non-2Nx2N inter partitions must split the root TU (when a split is allowed) */
    if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0])
        bCheckFull = false;

    if (bCheckFull)
    {
        // code full block
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
        uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;

        uint32_t tuDepthC = tuDepth;
        if (log2TrSizeC < 2)
        {
            /* 2x2 chroma TUs are not permitted: code one 4x4 chroma TU for the
             * four 4x4 luma TUs, and only on the first of the four */
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
            log2TrSizeC = 2;
            tuDepthC--;
            codeChroma &= !(absPartIdx & 3);
        }

        uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
        uint32_t setCbf = 1 << tuDepth; // cbf bit for this TU depth

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;

        uint32_t sizeIdx = log2TrSize - 2;

        cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
        const Yuv* fencYuv = mode.fencYuv;

        int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
        uint32_t strideResiY = resiYuv.m_size;

        /* luma: forward transform+quant; if any coefficients survive,
         * dequantize the residual in place, else zero it */
        const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
        uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);

        if (numSigY)
        {
            m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
            cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
        }
        else
        {
            primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
        }

        if (codeChroma)
        {
            uint32_t sizeIdxC = log2TrSizeC - 2;
            uint32_t strideResiC = resiYuv.m_csize;

            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
            coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
            coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
            /* 4:2:2 codes chroma as two vertically stacked sub-TUs */
            bool splitIntoSubTUs = (m_csp == X265_CSP_I422);

            TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
            do
            {
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

                cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);

                /* Cb: same quant / dequant-or-zero pattern as luma */
                int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
                const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
                uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
                if (numSigU)
                {
                    m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0);
                    cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
                }

                /* Cr */
                int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
                const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
                uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
                if (numSigV)
                {
                    m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
                    cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0);
                    cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
                }
            }
            while (tuIterator.isNextSection());

            /* merge the two 4:2:2 sub-TU CBFs up to this TU level */
            if (splitIntoSubTUs)
            {
                offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
                offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
            }
        }
    }
    else
    {
        X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");

        /* split: recurse into the four child TUs and OR their CBFs upward */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
            {
                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
            }
        }
        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
        }
    }
}
3113 | | |
3114 | | uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId) |
3115 | 0 | { |
3116 | 0 | uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth); |
3117 | |
|
3118 | 0 | if (m_rdCost.m_psyRd) |
3119 | 0 | return m_rdCost.calcPsyRdCost(dist, nullBits, energy); |
3120 | 0 | else if(m_rdCost.m_ssimRd) |
3121 | 0 | return m_rdCost.calcSsimRdCost(dist, nullBits, energy); |
3122 | 0 | else |
3123 | 0 | return m_rdCost.calcRdCost(dist, nullBits); |
3124 | 0 | } |
3125 | | |
/* Evaluate the split alternative for a TU: recurse into the four child TUs
 * (accumulating distortion/bits/energy into splitCost), OR the children's
 * CBFs up to this level, then add the subdivision/CBF signalling bits and
 * finalize splitCost.rdcost.  Returns true if any component has a non-zero
 * CBF among the children. */
bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
{
    CUData& cu = mode.cu;
    uint32_t depth = cuGeom.depth + tuDepth;
    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;

    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
    {
        /* --limit-tu DFS: after the first quadrant is coded, cap the depth of
         * the remaining three by the depth the first quadrant chose */
        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
        {
            m_maxTUDepth = cu.m_tuDepth[0];
            // Fetch maximum TU depth of first sub partition to limit recursion of others
            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
        }
        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
    }
    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
    {
        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
    }

    /* Coefficient bits were already accumulated per child TU above, so only
     * the subdivision flags and CBFs are coded here.
     * NOTE(review): these CBF bits are estimated with the entropy contexts at
     * this depth while the coefficients were costed in deeper contexts; the
     * total is therefore an approximation of the final bitstream cost --
     * confirm against codeInterSubdivCbfQT if exact accounting is needed. */
    m_entropyCoder.load(m_rqt[depth].rqtRoot);
    m_entropyCoder.resetBits();
    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
    splitCost.bits += splitCbfBits;

    if (m_rdCost.m_psyRd)
        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else if(m_rdCost.m_ssimRd)
        splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
    else
        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

    return ycbf || ucbf || vcbf;
}
3177 | | |
3178 | | void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore) |
3179 | 0 | { |
3180 | 0 | CUData& cu = mode.cu; |
3181 | 0 | uint32_t depth = cuGeom.depth + tuDepth; |
3182 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
3183 | 0 | bool bEnableRDOQ = !!m_param->rdoqLevel; |
3184 | |
|
3185 | 0 | bool bCheckSplit = log2TrSize > depthRange[0]; |
3186 | 0 | bool bCheckFull = log2TrSize <= depthRange[1]; |
3187 | 0 | bool bSaveTUData = false, bLoadTUData = false; |
3188 | 0 | uint32_t idx = 0; |
3189 | |
|
3190 | 0 | if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0) |
3191 | 0 | { |
3192 | 0 | if (bCheckSplit && bCheckFull && tuDepth) |
3193 | 0 | { |
3194 | 0 | uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; |
3195 | 0 | uint32_t qIdx = (absPartIdx / qNumParts) % 4; |
3196 | 0 | idx = (depth - 1) * 4 + qIdx; |
3197 | 0 | if (splitMore) |
3198 | 0 | { |
3199 | 0 | bLoadTUData = true; |
3200 | 0 | bCheckFull = false; |
3201 | 0 | } |
3202 | 0 | else |
3203 | 0 | { |
3204 | 0 | bSaveTUData = true; |
3205 | 0 | bCheckSplit = false; |
3206 | 0 | } |
3207 | 0 | } |
3208 | 0 | } |
3209 | 0 | else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH) |
3210 | 0 | { |
3211 | 0 | if (bCheckSplit && m_maxTUDepth >= 0) |
3212 | 0 | { |
3213 | 0 | uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth; |
3214 | 0 | bCheckSplit = log2TrSize > log2MaxTrSize; |
3215 | 0 | } |
3216 | 0 | } |
3217 | |
|
3218 | 0 | bool bSplitPresentFlag = bCheckSplit && bCheckFull; |
3219 | |
|
3220 | 0 | if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit) |
3221 | 0 | bCheckFull = false; |
3222 | |
|
3223 | 0 | X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n"); |
3224 | |
|
3225 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
3226 | 0 | uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0; |
3227 | 0 | uint32_t tuDepthC = tuDepth; |
3228 | 0 | if (log2TrSizeC < 2) |
3229 | 0 | { |
3230 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
3231 | 0 | log2TrSizeC = 2; |
3232 | 0 | tuDepthC--; |
3233 | 0 | codeChroma &= !(absPartIdx & 3); |
3234 | 0 | } |
3235 | | |
3236 | | // code full block |
3237 | 0 | Cost fullCost; |
3238 | 0 | fullCost.rdcost = MAX_INT64; |
3239 | |
|
3240 | 0 | uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; |
3241 | 0 | uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; |
3242 | 0 | uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
3243 | 0 | sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
3244 | 0 | uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
3245 | 0 | uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
3246 | 0 | uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} }; |
3247 | |
|
3248 | 0 | m_entropyCoder.store(m_rqt[depth].rqtRoot); |
3249 | |
|
3250 | 0 | uint32_t trSize = 1 << log2TrSize; |
3251 | 0 | const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); |
3252 | 0 | uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2; |
3253 | 0 | const Yuv* fencYuv = mode.fencYuv; |
3254 | | |
3255 | | // code full block |
3256 | 0 | if (bCheckFull) |
3257 | 0 | { |
3258 | 0 | uint32_t trSizeC = 1 << log2TrSizeC; |
3259 | 0 | int partSize = partitionFromLog2Size(log2TrSize); |
3260 | 0 | int partSizeC = partitionFromLog2Size(log2TrSizeC); |
3261 | 0 | const uint32_t qtLayer = log2TrSize - 2; |
3262 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
3263 | 0 | coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
3264 | |
|
3265 | 0 | bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0]; |
3266 | 0 | bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE; |
3267 | 0 | bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE; |
3268 | |
|
3269 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); |
3270 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); |
3271 | |
|
3272 | 0 | if (bEnableRDOQ) |
3273 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
3274 | |
|
3275 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
3276 | 0 | int16_t* resi = resiYuv.getLumaAddr(absPartIdx); |
3277 | 0 | numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); |
3278 | 0 | cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0]; |
3279 | |
|
3280 | 0 | m_entropyCoder.resetBits(); |
3281 | |
|
3282 | 0 | if (bSplitPresentFlag && log2TrSize > depthRange[0]) |
3283 | 0 | m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); |
3284 | |
|
3285 | 0 | if (cbfFlag[TEXT_LUMA][0]) |
3286 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); |
3287 | 0 | singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits(); |
3288 | |
|
3289 | 0 | X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n"); |
3290 | | |
3291 | | //Assuming zero residual |
3292 | 0 | sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size); |
3293 | 0 | uint32_t zeroEnergyY = 0; |
3294 | 0 | if (m_rdCost.m_psyRd) |
3295 | 0 | zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size); |
3296 | 0 | else if(m_rdCost.m_ssimRd) |
3297 | 0 | zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx); |
3298 | |
|
3299 | 0 | int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); |
3300 | 0 | uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size; |
3301 | |
|
3302 | 0 | if (cbfFlag[TEXT_LUMA][0]) |
3303 | 0 | { |
3304 | 0 | m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only |
3305 | | |
3306 | | // non-zero cost calculation for luma - This is an approximation |
3307 | | // finally we have to encode correct cbf after comparing with null cost |
3308 | 0 | pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); |
3309 | 0 | bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0; |
3310 | 0 | uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size; |
3311 | 0 | bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
3312 | 0 | bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0; |
3313 | 0 | bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0); |
3314 | 0 | primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY); |
3315 | |
|
3316 | 0 | const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY); |
3317 | 0 | uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); |
3318 | 0 | uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0; |
3319 | 0 | if (m_rdCost.m_psyRd) |
3320 | 0 | { |
3321 | 0 | nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY); |
3322 | 0 | singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY); |
3323 | 0 | } |
3324 | 0 | else if(m_rdCost.m_ssimRd) |
3325 | 0 | { |
3326 | 0 | nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx); |
3327 | 0 | singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY); |
3328 | 0 | } |
3329 | 0 | else |
3330 | 0 | singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]); |
3331 | |
|
3332 | 0 | if (cu.m_tqBypass[0]) |
3333 | 0 | { |
3334 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
3335 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
3336 | 0 | } |
3337 | 0 | else |
3338 | 0 | { |
3339 | | // zero-cost calculation for luma. This is an approximation |
3340 | | // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf. |
3341 | | // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma. |
3342 | 0 | uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA); |
3343 | |
|
3344 | 0 | if (nullCostY < singleCostY) |
3345 | 0 | { |
3346 | 0 | cbfFlag[TEXT_LUMA][0] = 0; |
3347 | 0 | singleBits[TEXT_LUMA][0] = 0; |
3348 | 0 | primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0); |
3349 | | #if CHECKED_BUILD || _DEBUG |
3350 | | uint32_t numCoeffY = 1 << (log2TrSize << 1); |
3351 | | memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY); |
3352 | | #endif |
3353 | 0 | if (checkTransformSkipY) |
3354 | 0 | minCost[TEXT_LUMA][0] = nullCostY; |
3355 | 0 | singleDist[TEXT_LUMA][0] = zeroDistY; |
3356 | 0 | singleEnergy[TEXT_LUMA][0] = zeroEnergyY; |
3357 | 0 | } |
3358 | 0 | else |
3359 | 0 | { |
3360 | 0 | if (checkTransformSkipY) |
3361 | 0 | minCost[TEXT_LUMA][0] = singleCostY; |
3362 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
3363 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
3364 | 0 | } |
3365 | 0 | } |
3366 | 0 | } |
3367 | 0 | else |
3368 | 0 | { |
3369 | 0 | if (checkTransformSkipY) |
3370 | 0 | minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA); |
3371 | 0 | primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0); |
3372 | 0 | singleDist[TEXT_LUMA][0] = zeroDistY; |
3373 | 0 | singleBits[TEXT_LUMA][0] = 0; |
3374 | 0 | singleEnergy[TEXT_LUMA][0] = zeroEnergyY; |
3375 | 0 | } |
3376 | |
|
3377 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
3378 | |
|
3379 | 0 | if (codeChroma) |
3380 | 0 | { |
3381 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
3382 | 0 | uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; |
3383 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
3384 | 0 | { |
3385 | 0 | sse_t zeroDistC = 0; |
3386 | 0 | uint32_t zeroEnergyC = 0; |
3387 | 0 | coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
3388 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
3389 | |
|
3390 | 0 | do |
3391 | 0 | { |
3392 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
3393 | 0 | uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); |
3394 | |
|
3395 | 0 | cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3396 | |
|
3397 | 0 | if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) |
3398 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); |
3399 | |
|
3400 | 0 | fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); |
3401 | 0 | resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
3402 | 0 | numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); |
3403 | 0 | cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; |
3404 | |
|
3405 | 0 | uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits(); |
3406 | 0 | if (cbfFlag[chromaId][tuIterator.section]) |
3407 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); |
3408 | |
|
3409 | 0 | singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount; |
3410 | |
|
3411 | 0 | int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); |
3412 | 0 | zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize)); |
3413 | | |
3414 | | // Assuming zero residual |
3415 | 0 | if (m_rdCost.m_psyRd) |
3416 | 0 | zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize); |
3417 | 0 | else if(m_rdCost.m_ssimRd) |
3418 | 0 | zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
3419 | |
|
3420 | 0 | if (cbfFlag[chromaId][tuIterator.section]) |
3421 | 0 | { |
3422 | 0 | m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset, |
3423 | 0 | log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); |
3424 | | |
3425 | | // non-zero cost calculation for luma, same as luma - This is an approximation |
3426 | | // finally we have to encode correct cbf after comparing with null cost |
3427 | 0 | pixel* curReconC = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); |
3428 | 0 | uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize; |
3429 | 0 | bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
3430 | 0 | bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
3431 | 0 | bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
3432 | 0 | bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0); |
3433 | 0 | primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC); |
3434 | 0 | sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC)); |
3435 | 0 | uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); |
3436 | 0 | uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0; |
3437 | 0 | if (m_rdCost.m_psyRd) |
3438 | 0 | { |
3439 | 0 | nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC); |
3440 | 0 | singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
3441 | 0 | } |
3442 | 0 | else if(m_rdCost.m_ssimRd) |
3443 | 0 | { |
3444 | 0 | nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
3445 | 0 | singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
3446 | 0 | } |
3447 | 0 | else |
3448 | 0 | singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]); |
3449 | |
|
3450 | 0 | if (cu.m_tqBypass[0]) |
3451 | 0 | { |
3452 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
3453 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
3454 | 0 | } |
3455 | 0 | else |
3456 | 0 | { |
3457 | | //zero-cost calculation for chroma. This is an approximation |
3458 | 0 | uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId); |
3459 | |
|
3460 | 0 | if (nullCostC < singleCostC) |
3461 | 0 | { |
3462 | 0 | cbfFlag[chromaId][tuIterator.section] = 0; |
3463 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
3464 | 0 | primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0); |
3465 | | #if CHECKED_BUILD || _DEBUG |
3466 | | uint32_t numCoeffC = 1 << (log2TrSizeC << 1); |
3467 | | memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC); |
3468 | | #endif |
3469 | 0 | if (checkTransformSkipC) |
3470 | 0 | minCost[chromaId][tuIterator.section] = nullCostC; |
3471 | 0 | singleDist[chromaId][tuIterator.section] = zeroDistC; |
3472 | 0 | singleEnergy[chromaId][tuIterator.section] = zeroEnergyC; |
3473 | 0 | } |
3474 | 0 | else |
3475 | 0 | { |
3476 | 0 | if (checkTransformSkipC) |
3477 | 0 | minCost[chromaId][tuIterator.section] = singleCostC; |
3478 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
3479 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
3480 | 0 | } |
3481 | 0 | } |
3482 | 0 | } |
3483 | 0 | else |
3484 | 0 | { |
3485 | 0 | if (checkTransformSkipC) |
3486 | 0 | minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId); |
3487 | 0 | primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0); |
3488 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
3489 | 0 | singleDist[chromaId][tuIterator.section] = zeroDistC; |
3490 | 0 | singleEnergy[chromaId][tuIterator.section] = zeroEnergyC; |
3491 | 0 | } |
3492 | |
|
3493 | 0 | cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3494 | 0 | } |
3495 | 0 | while (tuIterator.isNextSection()); |
3496 | 0 | } |
3497 | 0 | } |
3498 | |
|
3499 | 0 | if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) |
3500 | 0 | { |
3501 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
3502 | 0 | { |
3503 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
3504 | 0 | do |
3505 | 0 | { |
3506 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
3507 | 0 | cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3508 | 0 | } |
3509 | 0 | while(tuIterator.isNextSection()); |
3510 | 0 | } |
3511 | 0 | } |
3512 | 0 | if (checkTransformSkipY) |
3513 | 0 | { |
3514 | 0 | sse_t nonZeroDistY = 0; |
3515 | 0 | uint32_t nonZeroEnergyY = 0; |
3516 | 0 | uint64_t singleCostY = MAX_INT64; |
3517 | |
|
3518 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
3519 | |
|
3520 | 0 | cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth); |
3521 | |
|
3522 | 0 | if (bEnableRDOQ) |
3523 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
3524 | |
|
3525 | 0 | fenc = fencYuv->getLumaAddr(absPartIdx); |
3526 | 0 | resi = resiYuv.getLumaAddr(absPartIdx); |
3527 | 0 | uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true); |
3528 | |
|
3529 | 0 | if (numSigTSkipY) |
3530 | 0 | { |
3531 | 0 | m_entropyCoder.resetBits(); |
3532 | 0 | m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth); |
3533 | 0 | m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA); |
3534 | 0 | const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); |
3535 | |
|
3536 | 0 | m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY); |
3537 | 0 | bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
3538 | |
|
3539 | 0 | bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0); |
3540 | 0 | primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize); |
3541 | 0 | nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize); |
3542 | |
|
3543 | 0 | if (m_rdCost.m_psyRd) |
3544 | 0 | { |
3545 | 0 | nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize); |
3546 | 0 | singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY); |
3547 | 0 | } |
3548 | 0 | else if(m_rdCost.m_ssimRd) |
3549 | 0 | { |
3550 | 0 | nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx); |
3551 | 0 | singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY); |
3552 | 0 | } |
3553 | 0 | else |
3554 | 0 | singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY); |
3555 | 0 | } |
3556 | |
|
3557 | 0 | if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY) |
3558 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); |
3559 | 0 | else |
3560 | 0 | { |
3561 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
3562 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
3563 | 0 | cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; |
3564 | 0 | bestTransformMode[TEXT_LUMA][0] = 1; |
3565 | 0 | if (m_param->limitTU) |
3566 | 0 | numSig[TEXT_LUMA][0] = numSigTSkipY; |
3567 | 0 | uint32_t numCoeffY = 1 << (log2TrSize << 1); |
3568 | 0 | memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY); |
3569 | 0 | primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize); |
3570 | 0 | } |
3571 | |
|
3572 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
3573 | 0 | } |
3574 | |
|
3575 | 0 | if (codeChroma && checkTransformSkipC) |
3576 | 0 | { |
3577 | 0 | sse_t nonZeroDistC = 0; |
3578 | 0 | uint32_t nonZeroEnergyC = 0; |
3579 | 0 | uint64_t singleCostC = MAX_INT64; |
3580 | 0 | uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; |
3581 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
3582 | |
|
3583 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
3584 | |
|
3585 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
3586 | 0 | { |
3587 | 0 | coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
3588 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
3589 | |
|
3590 | 0 | do |
3591 | 0 | { |
3592 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
3593 | 0 | uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); |
3594 | |
|
3595 | 0 | int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); |
3596 | |
|
3597 | 0 | cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3598 | |
|
3599 | 0 | if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) |
3600 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); |
3601 | |
|
3602 | 0 | fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); |
3603 | 0 | resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
3604 | 0 | uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); |
3605 | |
|
3606 | 0 | m_entropyCoder.resetBits(); |
3607 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
3608 | |
|
3609 | 0 | if (numSigTSkipC) |
3610 | 0 | { |
3611 | 0 | m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth); |
3612 | 0 | m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId); |
3613 | 0 | singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); |
3614 | |
|
3615 | 0 | m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff, |
3616 | 0 | log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); |
3617 | 0 | bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
3618 | 0 | bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0); |
3619 | 0 | primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC); |
3620 | 0 | nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC)); |
3621 | 0 | if (m_rdCost.m_psyRd) |
3622 | 0 | { |
3623 | 0 | nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC); |
3624 | 0 | singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
3625 | 0 | } |
3626 | 0 | else if(m_rdCost.m_ssimRd) |
3627 | 0 | { |
3628 | 0 | nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
3629 | 0 | singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
3630 | 0 | } |
3631 | 0 | else |
3632 | 0 | singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]); |
3633 | 0 | } |
3634 | |
|
3635 | 0 | if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC) |
3636 | 0 | cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3637 | 0 | else |
3638 | 0 | { |
3639 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
3640 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
3641 | 0 | cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; |
3642 | 0 | bestTransformMode[chromaId][tuIterator.section] = 1; |
3643 | 0 | uint32_t numCoeffC = 1 << (log2TrSizeC << 1); |
3644 | 0 | memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC); |
3645 | 0 | primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC); |
3646 | 0 | } |
3647 | |
|
3648 | 0 | cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
3649 | 0 | } |
3650 | 0 | while (tuIterator.isNextSection()); |
3651 | 0 | } |
3652 | 0 | } |
3653 | | |
3654 | | // Here we were encoding cbfs and coefficients, after calculating distortion above. |
3655 | | // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected |
3656 | | // bits required for coefficients and added with number of cbf bits. As I tested the order does not |
3657 | | // make any difference. But bit confused whether I should load the original context as below. |
3658 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
3659 | 0 | m_entropyCoder.resetBits(); |
3660 | | |
3661 | | //Encode cbf flags |
3662 | 0 | if (codeChroma) |
3663 | 0 | { |
3664 | 0 | if (!splitIntoSubTUs) |
3665 | 0 | { |
3666 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth); |
3667 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth); |
3668 | 0 | } |
3669 | 0 | else |
3670 | 0 | { |
3671 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
3672 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
3673 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth); |
3674 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth); |
3675 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth); |
3676 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth); |
3677 | 0 | } |
3678 | 0 | } |
3679 | |
|
3680 | 0 | m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); |
3681 | |
|
3682 | 0 | uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits(); |
3683 | |
|
3684 | 0 | uint32_t coeffBits = 0; |
3685 | 0 | coeffBits = singleBits[TEXT_LUMA][0]; |
3686 | 0 | for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) |
3687 | 0 | { |
3688 | 0 | coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex]; |
3689 | 0 | coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex]; |
3690 | 0 | } |
3691 | | |
3692 | | // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma. |
3693 | | // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for |
3694 | | // four split block's individual cbf value. This is not known before analysis of four split blocks. |
3695 | | // For that reason, I am collecting individual coefficient bits only. |
3696 | 0 | fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits; |
3697 | |
|
3698 | 0 | fullCost.distortion += singleDist[TEXT_LUMA][0]; |
3699 | 0 | fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also |
3700 | 0 | for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) |
3701 | 0 | { |
3702 | 0 | fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex]; |
3703 | 0 | fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex]; |
3704 | 0 | } |
3705 | |
|
3706 | 0 | if (m_rdCost.m_psyRd) |
3707 | 0 | fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
3708 | 0 | else if(m_rdCost.m_ssimRd) |
3709 | 0 | fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
3710 | 0 | else |
3711 | 0 | fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits); |
3712 | |
|
3713 | 0 | if (m_param->limitTU && bCheckSplit) |
3714 | 0 | { |
3715 | | // Stop recursion if the TU's energy level is minimal |
3716 | 0 | uint32_t numCoeff = trSize * trSize; |
3717 | 0 | if (cbfFlag[TEXT_LUMA][0] == 0) |
3718 | 0 | bCheckSplit = false; |
3719 | 0 | else if (numSig[TEXT_LUMA][0] < (numCoeff / 64)) |
3720 | 0 | { |
3721 | 0 | uint32_t energy = 0; |
3722 | 0 | for (uint32_t i = 0; i < numCoeff; i++) |
3723 | 0 | energy += abs(coeffCurY[i]); |
3724 | 0 | if (energy == numSig[TEXT_LUMA][0]) |
3725 | 0 | bCheckSplit = false; |
3726 | 0 | } |
3727 | 0 | } |
3728 | |
|
3729 | 0 | if (bSaveTUData) |
3730 | 0 | { |
3731 | 0 | for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++) |
3732 | 0 | { |
3733 | 0 | for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++) |
3734 | 0 | { |
3735 | 0 | m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part]; |
3736 | 0 | m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part]; |
3737 | 0 | } |
3738 | 0 | } |
3739 | 0 | m_cacheTU.cost[idx] = fullCost; |
3740 | 0 | m_entropyCoder.store(m_cacheTU.rqtStore[idx]); |
3741 | 0 | } |
3742 | 0 | } |
3743 | 0 | if (bLoadTUData) |
3744 | 0 | { |
3745 | 0 | for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++) |
3746 | 0 | { |
3747 | 0 | for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++) |
3748 | 0 | { |
3749 | 0 | bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part]; |
3750 | 0 | cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part]; |
3751 | 0 | } |
3752 | 0 | } |
3753 | 0 | fullCost = m_cacheTU.cost[idx]; |
3754 | 0 | m_entropyCoder.load(m_cacheTU.rqtStore[idx]); |
3755 | 0 | bCheckFull = true; |
3756 | 0 | } |
3757 | | |
3758 | | // code sub-blocks |
3759 | 0 | if (bCheckSplit) |
3760 | 0 | { |
3761 | 0 | if (bCheckFull) |
3762 | 0 | { |
3763 | 0 | m_entropyCoder.store(m_rqt[depth].rqtTest); |
3764 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
3765 | 0 | } |
3766 | |
|
3767 | 0 | Cost splitCost; |
3768 | 0 | if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) |
3769 | 0 | { |
3770 | | // Subdiv flag can be encoded at the start of analysis of split blocks. |
3771 | 0 | m_entropyCoder.resetBits(); |
3772 | 0 | m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); |
3773 | 0 | splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); |
3774 | 0 | } |
3775 | |
|
3776 | 0 | bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0); |
3777 | 0 | if (yCbCrCbf || !bCheckFull) |
3778 | 0 | { |
3779 | 0 | if (splitCost.rdcost < fullCost.rdcost) |
3780 | 0 | { |
3781 | 0 | if (m_limitTU & X265_TU_LIMIT_BFS) |
3782 | 0 | { |
3783 | 0 | uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1); |
3784 | 0 | bool nextSplit = nextlog2TrSize > depthRange[0]; |
3785 | 0 | if (nextSplit) |
3786 | 0 | { |
3787 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
3788 | 0 | splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0; |
3789 | 0 | if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) |
3790 | 0 | { |
3791 | | // Subdiv flag can be encoded at the start of analysis of split blocks. |
3792 | 0 | m_entropyCoder.resetBits(); |
3793 | 0 | m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); |
3794 | 0 | splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); |
3795 | 0 | } |
3796 | 0 | splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1); |
3797 | 0 | } |
3798 | 0 | } |
3799 | 0 | outCosts.distortion += splitCost.distortion; |
3800 | 0 | outCosts.rdcost += splitCost.rdcost; |
3801 | 0 | outCosts.bits += splitCost.bits; |
3802 | 0 | outCosts.energy += splitCost.energy; |
3803 | 0 | return; |
3804 | 0 | } |
3805 | 0 | else |
3806 | 0 | outCosts.energy += splitCost.energy; |
3807 | 0 | } |
3808 | | |
3809 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth); |
3810 | 0 | if (codeChroma) |
3811 | 0 | { |
3812 | 0 | if (!splitIntoSubTUs) |
3813 | 0 | { |
3814 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth); |
3815 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth); |
3816 | 0 | } |
3817 | 0 | else |
3818 | 0 | { |
3819 | 0 | uint32_t tuNumParts = absPartIdxStep >> 1; |
3820 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts); |
3821 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); |
3822 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts); |
3823 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); |
3824 | 0 | } |
3825 | 0 | } |
3826 | 0 | X265_CHECK(bCheckFull, "check-full must be set\n"); |
3827 | 0 | m_entropyCoder.load(m_rqt[depth].rqtTest); |
3828 | 0 | } |
3829 | | |
3830 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); |
3831 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
3832 | |
|
3833 | 0 | if (codeChroma) |
3834 | 0 | { |
3835 | 0 | if (!splitIntoSubTUs) |
3836 | 0 | { |
3837 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth); |
3838 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth); |
3839 | 0 | } |
3840 | 0 | else |
3841 | 0 | { |
3842 | 0 | uint32_t tuNumParts = absPartIdxStep >> 1; |
3843 | |
|
3844 | 0 | offsetCBFs(cbfFlag[TEXT_CHROMA_U]); |
3845 | 0 | offsetCBFs(cbfFlag[TEXT_CHROMA_V]); |
3846 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts); |
3847 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); |
3848 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts); |
3849 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); |
3850 | 0 | } |
3851 | 0 | } |
3852 | |
|
3853 | 0 | outCosts.distortion += fullCost.distortion; |
3854 | 0 | outCosts.rdcost += fullCost.rdcost; |
3855 | 0 | outCosts.bits += fullCost.bits; |
3856 | 0 | outCosts.energy += fullCost.energy; |
3857 | 0 | } |
3858 | | |
3859 | | void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]) |
3860 | 0 | { |
3861 | 0 | X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n"); |
3862 | |
|
3863 | 0 | const bool bSubdiv = tuDepth < cu.m_tuDepth[absPartIdx]; |
3864 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
3865 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
3866 | 0 | { |
3867 | 0 | if (!(log2TrSize - m_hChromaShift < 2)) |
3868 | 0 | { |
3869 | 0 | uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2); |
3870 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1)) |
3871 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv); |
3872 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1)) |
3873 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv); |
3874 | 0 | } |
3875 | 0 | } |
3876 | |
|
3877 | 0 | if (!bSubdiv) |
3878 | 0 | { |
3879 | 0 | m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); |
3880 | 0 | } |
3881 | 0 | else |
3882 | 0 | { |
3883 | 0 | uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2; |
3884 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
3885 | 0 | codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange); |
3886 | 0 | } |
3887 | 0 | } |
3888 | | |
3889 | | void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth) |
3890 | 0 | { |
3891 | 0 | const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
3892 | |
|
3893 | 0 | if (tuDepth < cu.m_tuDepth[absPartIdx]) |
3894 | 0 | { |
3895 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
3896 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
3897 | 0 | saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1); |
3898 | 0 | return; |
3899 | 0 | } |
3900 | | |
3901 | 0 | const uint32_t qtLayer = log2TrSize - 2; |
3902 | |
|
3903 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
3904 | 0 | uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0; |
3905 | 0 | uint32_t tuDepthC = tuDepth; |
3906 | 0 | if (log2TrSizeC < 2) |
3907 | 0 | { |
3908 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
3909 | 0 | log2TrSizeC = 2; |
3910 | 0 | tuDepthC--; |
3911 | 0 | codeChroma &= !(absPartIdx & 3); |
3912 | 0 | } |
3913 | |
|
3914 | 0 | m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize); |
3915 | |
|
3916 | 0 | uint32_t numCoeffY = 1 << (log2TrSize * 2); |
3917 | 0 | uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2; |
3918 | 0 | coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
3919 | 0 | coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY; |
3920 | 0 | memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY); |
3921 | |
|
3922 | 0 | if (codeChroma) |
3923 | 0 | { |
3924 | 0 | m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift); |
3925 | |
|
3926 | 0 | uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); |
3927 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
3928 | |
|
3929 | 0 | coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; |
3930 | 0 | coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; |
3931 | 0 | coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; |
3932 | 0 | coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC; |
3933 | 0 | memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC); |
3934 | 0 | memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC); |
3935 | 0 | } |
3936 | 0 | } |
3937 | | |
3938 | | /* returns the number of bits required to signal a non-most-probable mode. |
3939 | | * on return mpms contains bitmap of most probable modes */ |
3940 | | uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const |
3941 | 0 | { |
3942 | 0 | cu.getIntraDirLumaPredictor(absPartIdx, mpmModes); |
3943 | |
|
3944 | 0 | mpms = 0; |
3945 | 0 | for (int i = 0; i < 3; ++i) |
3946 | 0 | mpms |= ((uint64_t)1 << mpmModes[i]); |
3947 | |
|
3948 | 0 | return m_entropyCoder.bitsIntraModeNonMPM(); |
3949 | 0 | } |
3950 | | |
3951 | | /* swap the current mode/cost with the mode with the highest cost in the |
3952 | | * current candidate list, if its cost is better (maintain a top N list) */ |
3953 | | void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList) |
3954 | 0 | { |
3955 | 0 | uint32_t maxIndex = 0; |
3956 | 0 | uint64_t maxValue = 0; |
3957 | |
|
3958 | 0 | for (int i = 0; i < maxCandCount; i++) |
3959 | 0 | { |
3960 | 0 | if (maxValue < candCostList[i]) |
3961 | 0 | { |
3962 | 0 | maxValue = candCostList[i]; |
3963 | 0 | maxIndex = i; |
3964 | 0 | } |
3965 | 0 | } |
3966 | |
|
3967 | 0 | if (cost < maxValue) |
3968 | 0 | { |
3969 | 0 | candCostList[maxIndex] = cost; |
3970 | 0 | candModeList[maxIndex] = mode; |
3971 | 0 | } |
3972 | 0 | } |
3973 | | |
3974 | | void Search::checkDQP(Mode& mode, const CUGeom& cuGeom) |
3975 | 0 | { |
3976 | 0 | CUData& cu = mode.cu; |
3977 | 0 | if (cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth) |
3978 | 0 | { |
3979 | 0 | if (cu.getQtRootCbf(0)) |
3980 | 0 | { |
3981 | 0 | if (m_param->rdLevel >= 3) |
3982 | 0 | { |
3983 | 0 | mode.contexts.resetBits(); |
3984 | 0 | mode.contexts.codeDeltaQP(cu, 0); |
3985 | 0 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); |
3986 | 0 | mode.totalBits += bits; |
3987 | 0 | updateModeCost(mode); |
3988 | 0 | } |
3989 | 0 | else if (m_param->rdLevel <= 1) |
3990 | 0 | { |
3991 | 0 | mode.sa8dBits++; |
3992 | 0 | mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits); |
3993 | 0 | } |
3994 | 0 | else |
3995 | 0 | { |
3996 | 0 | mode.totalBits++; |
3997 | 0 | updateModeCost(mode); |
3998 | 0 | } |
3999 | 0 | } |
4000 | 0 | else |
4001 | 0 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); |
4002 | 0 | } |
4003 | 0 | } |
4004 | | |
4005 | | void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom) |
4006 | 0 | { |
4007 | 0 | CUData& cu = mode.cu; |
4008 | |
|
4009 | 0 | if ((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP) |
4010 | 0 | { |
4011 | 0 | bool hasResidual = false; |
4012 | | |
4013 | | /* Check if any sub-CU has a non-zero QP */ |
4014 | 0 | for (uint32_t blkIdx = 0; blkIdx < cuGeom.numPartitions; blkIdx++) |
4015 | 0 | { |
4016 | 0 | if (cu.getQtRootCbf(blkIdx)) |
4017 | 0 | { |
4018 | 0 | hasResidual = true; |
4019 | 0 | break; |
4020 | 0 | } |
4021 | 0 | } |
4022 | 0 | if (hasResidual) |
4023 | 0 | { |
4024 | 0 | if (m_param->rdLevel >= 3) |
4025 | 0 | { |
4026 | 0 | mode.contexts.resetBits(); |
4027 | 0 | mode.contexts.codeDeltaQP(cu, 0); |
4028 | 0 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); |
4029 | 0 | mode.totalBits += bits; |
4030 | 0 | updateModeCost(mode); |
4031 | 0 | } |
4032 | 0 | else if (m_param->rdLevel <= 1) |
4033 | 0 | { |
4034 | 0 | mode.sa8dBits++; |
4035 | 0 | mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits); |
4036 | 0 | } |
4037 | 0 | else |
4038 | 0 | { |
4039 | 0 | mode.totalBits++; |
4040 | 0 | updateModeCost(mode); |
4041 | 0 | } |
4042 | | /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled). |
4043 | | When the non-zero CBF sub-CU is found, stop */ |
4044 | 0 | cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); |
4045 | 0 | } |
4046 | 0 | else |
4047 | | /* No residual within this CU or subCU, so reset QP to RefQP */ |
4048 | 0 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); |
4049 | 0 | } |
4050 | 0 | } |