/src/x265/source/encoder/search.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2020 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Steve Borho <steve@borho.org> |
5 | | * Min Chen <chenm003@163.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #include "common.h" |
26 | | #include "primitives.h" |
27 | | #include "picyuv.h" |
28 | | #include "cudata.h" |
29 | | |
30 | | #include "search.h" |
31 | | #include "entropy.h" |
32 | | #include "rdcost.h" |
33 | | |
34 | | #include "analysis.h" // TLD |
35 | | #include "framedata.h" |
36 | | |
37 | | using namespace X265_NS; |
38 | | |
39 | | #if _MSC_VER |
40 | | #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning) |
41 | | #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data) |
42 | | #pragma warning(disable: 4127) // conditional expression is constant |
43 | | #endif |
44 | | |
// Presumably the number of bits spent on an MVP index.
// NOTE(review): not referenced in the visible part of this file - confirm where it is used.
45 | 0 | #define MVP_IDX_BITS 1 |
46 | |
// Definition of the static member Search::zeroShort: MAX_CU_SIZE int16_t entries, all
// zero-initialised by the "{ 0 }" initialiser, declared through the ALIGN_VAR_32 macro.
47 | | ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 }; |
48 | | |
49 | | Search::Search() |
50 | 0 | { |
51 | 0 | memset(m_rqt, 0, sizeof(m_rqt)); |
52 | |
|
53 | 0 | for (int i = 0; i < 3; i++) |
54 | 0 | { |
55 | 0 | m_qtTempTransformSkipFlag[i] = NULL; |
56 | 0 | m_qtTempCbf[i] = NULL; |
57 | 0 | } |
58 | |
|
59 | 0 | m_numLayers = 0; |
60 | 0 | m_intraPred = NULL; |
61 | 0 | m_intraPredAngs = NULL; |
62 | 0 | m_fencScaled = NULL; |
63 | 0 | m_fencTransposed = NULL; |
64 | 0 | m_tsCoeff = NULL; |
65 | 0 | m_tsResidual = NULL; |
66 | 0 | m_tsRecon = NULL; |
67 | 0 | m_param = NULL; |
68 | 0 | m_slice = NULL; |
69 | 0 | m_frame = NULL; |
70 | 0 | m_maxTUDepth = -1; |
71 | 0 | } |
72 | | |
73 | | bool Search::initSearch(const x265_param& param, ScalingList& scalingList) |
74 | 0 | { |
75 | 0 | uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize]; |
76 | 0 | m_param = ¶m; |
77 | 0 | m_bFrameParallel = param.frameNumThreads > 1; |
78 | 0 | m_numLayers = g_log2Size[param.maxCUSize] - 2; |
79 | | #if ENABLE_SCC_EXT |
80 | | m_ibcEnabled = param.bEnableSCC; |
81 | | #endif |
82 | |
|
83 | 0 | m_rdCost.setPsyRdScale(param.psyRd); |
84 | 0 | m_rdCost.setSsimRd(param.bSsimRd); |
85 | 0 | m_me.init(param.internalCsp); |
86 | |
|
87 | 0 | bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder); |
88 | 0 | if (m_param->noiseReductionIntra || m_param->noiseReductionInter ) |
89 | 0 | ok &= m_quant.allocNoiseReduction(param); |
90 | |
|
91 | 0 | ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */ |
92 | | |
93 | | /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed |
94 | | * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */ |
95 | 0 | m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight; |
96 | |
|
97 | 0 | uint32_t sizeL = 1 << (maxLog2CUSize * 2); |
98 | 0 | uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); |
99 | 0 | uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2; |
100 | |
|
101 | 0 | m_limitTU = 0; |
102 | 0 | if (m_param->limitTU) |
103 | 0 | { |
104 | 0 | if (m_param->limitTU == 1) |
105 | 0 | m_limitTU = X265_TU_LIMIT_BFS; |
106 | 0 | else if (m_param->limitTU == 2) |
107 | 0 | m_limitTU = X265_TU_LIMIT_DFS; |
108 | 0 | else if (m_param->limitTU == 3) |
109 | 0 | m_limitTU = X265_TU_LIMIT_NEIGH; |
110 | 0 | else if (m_param->limitTU == 4) |
111 | 0 | m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH; |
112 | 0 | } |
113 | | |
114 | | /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32 |
115 | | * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts |
116 | | * which are reconstructed at each depth are valid. At the end, the transform depth table |
117 | | * is walked and the coeff and recon at the correct depths are collected */ |
118 | |
|
119 | 0 | if (param.internalCsp != X265_CSP_I400) |
120 | 0 | { |
121 | 0 | for (uint32_t i = 0; i <= m_numLayers; i++) |
122 | 0 | { |
123 | 0 | CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2); |
124 | 0 | m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL; |
125 | 0 | m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC; |
126 | 0 | ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp); |
127 | 0 | ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp); |
128 | 0 | } |
129 | 0 | } |
130 | 0 | else |
131 | 0 | { |
132 | 0 | for (uint32_t i = 0; i <= m_numLayers; i++) |
133 | 0 | { |
134 | 0 | CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL); |
135 | 0 | m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL; |
136 | 0 | ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp); |
137 | 0 | ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp); |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | | /* the rest of these buffers are indexed per-depth */ |
142 | 0 | for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) |
143 | 0 | { |
144 | 0 | int cuSize = param.maxCUSize >> i; |
145 | 0 | ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp); |
146 | 0 | ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp); |
147 | 0 | ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp); |
148 | 0 | ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp); |
149 | 0 | } |
150 | |
|
151 | 0 | if (param.internalCsp != X265_CSP_I400) |
152 | 0 | { |
153 | 0 | CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3); |
154 | 0 | m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions; |
155 | 0 | m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2; |
156 | 0 | CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3); |
157 | 0 | m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions; |
158 | 0 | m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2; |
159 | 0 | } |
160 | 0 | else |
161 | 0 | { |
162 | 0 | CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions); |
163 | 0 | m_qtTempCbf[1] = m_qtTempCbf[2] = NULL; |
164 | 0 | CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions); |
165 | 0 | m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL; |
166 | 0 | } |
167 | | |
168 | 0 | CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3)); |
169 | 0 | m_fencScaled = m_intraPred + 32 * 32; |
170 | 0 | m_fencTransposed = m_fencScaled + 32 * 32; |
171 | 0 | m_intraPredAngs = m_fencTransposed + 32 * 32; |
172 | |
|
173 | 0 | CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE); |
174 | 0 | CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE); |
175 | 0 | CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE); |
176 | |
|
177 | | #if ENABLE_SCC_EXT |
178 | | m_numBVs = 0; |
179 | | m_numBV16s = 0; |
180 | | #endif |
181 | |
|
182 | 0 | return ok; |
183 | | |
184 | 0 | fail: |
185 | 0 | return false; |
186 | 0 | } |
187 | | |
188 | | Search::~Search() |
189 | 0 | { |
190 | 0 | for (uint32_t i = 0; i <= m_numLayers; i++) |
191 | 0 | { |
192 | 0 | X265_FREE(m_rqt[i].coeffRQT[0]); |
193 | 0 | m_rqt[i].reconQtYuv.destroy(); |
194 | 0 | m_rqt[i].resiQtYuv.destroy(); |
195 | 0 | } |
196 | |
|
197 | 0 | for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) |
198 | 0 | { |
199 | 0 | m_rqt[i].tmpResiYuv.destroy(); |
200 | 0 | m_rqt[i].tmpPredYuv.destroy(); |
201 | 0 | m_rqt[i].bidirPredYuv[0].destroy(); |
202 | 0 | m_rqt[i].bidirPredYuv[1].destroy(); |
203 | 0 | } |
204 | |
|
205 | 0 | X265_FREE(m_qtTempCbf[0]); |
206 | 0 | X265_FREE(m_qtTempTransformSkipFlag[0]); |
207 | 0 | X265_FREE(m_intraPred); |
208 | 0 | X265_FREE(m_tsCoeff); |
209 | 0 | X265_FREE(m_tsResidual); |
210 | 0 | X265_FREE(m_tsRecon); |
211 | 0 | } |
212 | | |
213 | | int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp) |
214 | 0 | { |
215 | 0 | X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n"); |
216 | |
|
217 | 0 | m_me.setQP(qp); |
218 | 0 | m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp); |
219 | |
|
220 | 0 | int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp); |
221 | 0 | m_quant.setQPforQuant(ctu, quantQP); |
222 | 0 | return quantQP; |
223 | 0 | } |
224 | | |
225 | | #if CHECKED_BUILD || _DEBUG |
226 | | void Search::invalidateContexts(int fromDepth) |
227 | | { |
228 | | /* catch reads without previous writes */ |
229 | | for (int d = fromDepth; d < NUM_FULL_DEPTH; d++) |
230 | | { |
231 | | m_rqt[d].cur.markInvalid(); |
232 | | m_rqt[d].rqtTemp.markInvalid(); |
233 | | m_rqt[d].rqtRoot.markInvalid(); |
234 | | m_rqt[d].rqtTest.markInvalid(); |
235 | | } |
236 | | } |
237 | | #else |
238 | 0 | void Search::invalidateContexts(int) {} |
239 | | #endif |
240 | | |
241 | | void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx) |
242 | 0 | { |
243 | 0 | uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx]; |
244 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
245 | |
|
246 | 0 | if (!(log2TrSize - m_hChromaShift < 2)) |
247 | 0 | { |
248 | 0 | uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2); |
249 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1)) |
250 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); |
251 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1)) |
252 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); |
253 | 0 | } |
254 | |
|
255 | 0 | if (subdiv) |
256 | 0 | { |
257 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
258 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
259 | 0 | codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx); |
260 | 0 | } |
261 | 0 | } |
262 | | |
263 | | void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype) |
264 | 0 | { |
265 | 0 | if (!cu.getCbf(absPartIdx, ttype, tuDepth)) |
266 | 0 | return; |
267 | | |
268 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
269 | |
|
270 | 0 | if (tuDepth < cu.m_tuDepth[absPartIdx]) |
271 | 0 | { |
272 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
273 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
274 | 0 | codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype); |
275 | |
|
276 | 0 | return; |
277 | 0 | } |
278 | | |
279 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
280 | |
|
281 | 0 | if (log2TrSizeC < 2) |
282 | 0 | { |
283 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
284 | 0 | if (absPartIdx & 3) |
285 | 0 | return; |
286 | 0 | log2TrSizeC = 2; |
287 | 0 | } |
288 | | |
289 | 0 | uint32_t qtLayer = log2TrSize - 2; |
290 | |
|
291 | 0 | if (m_csp != X265_CSP_I422) |
292 | 0 | { |
293 | 0 | uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0; |
294 | 0 | uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift); |
295 | 0 | coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; |
296 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); |
297 | 0 | } |
298 | 0 | else |
299 | 0 | { |
300 | 0 | uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1); |
301 | 0 | coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; |
302 | 0 | uint32_t subTUSize = 1 << (log2TrSizeC * 2); |
303 | 0 | uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2); |
304 | 0 | if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) |
305 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); |
306 | 0 | if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) |
307 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype); |
308 | 0 | } |
309 | 0 | } |
310 | | |
// Recursive RD search over the intra luma transform tree of one CU.
// For the TU at (tuDepth, absPartIdx) this function can:
//   - code the TU at its full size: intra prediction, transform + quantization,
//     reconstruction into the per-layer reconQtYuv buffer, bit counting and RD cost;
//   - code the four sub-TUs at tuDepth + 1, through codeIntraLumaTSkip() or a
//     recursive call to this function.
// The cheaper alternative is kept and its cost is ADDED to outCost (outCost is an
// accumulator; it is never reset here).
//   mode        supplies the CU data plus the source (fencYuv) and prediction (predYuv) buffers
//   cuGeom      geometry of the CU (depth, log2CUSize, absPartIdx, numPartitions are read)
//   bAllowSplit when false, a split is only tried if the full-size TU is not allowed
//   depthRange  depthRange[0] / depthRange[1] are the smallest / largest log2 TU size allowed
311 | | void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2]) |
312 | 0 | { |
313 | 0 | CUData& cu = mode.cu; |
314 | 0 | uint32_t fullDepth = cuGeom.depth + tuDepth; |
315 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
316 | 0 | uint32_t qtLayer = log2TrSize - 2; |
317 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
// mightNotSplit: this TU size is small enough to be coded as a single TU
// mightSplit:    a smaller TU size is still allowed, and either splitting is permitted
//                or it is mandatory because the full size is not allowed
318 | 0 | bool mightNotSplit = log2TrSize <= depthRange[1]; |
319 | 0 | bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit); |
320 | 0 | bool bEnableRDOQ = !!m_param->rdoqLevel; |
321 | | |
322 | | /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */ |
323 | 0 | if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4) |
324 | 0 | { |
325 | 0 | mightNotSplit = false; |
326 | 0 | mightSplit = true; |
327 | 0 | } |
328 | |
|
329 | 0 | Cost fullCost; |
// CBF value of the full-size attempt, kept so it can be restored if the split loses
330 | 0 | uint32_t bCBF = 0; |
331 | |
|
332 | 0 | pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); |
333 | 0 | uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size; |
334 | |
|
// ---- alternative 1: code the TU at its full size ----
335 | 0 | if (mightNotSplit) |
336 | 0 | { |
// remember the entropy state so the split attempt can start from the same point
337 | 0 | if (mightSplit) |
338 | 0 | m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); |
339 | |
|
340 | 0 | const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); |
341 | 0 | pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); |
342 | 0 | int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); |
// the same stride is used for fenc, pred and residual below
343 | 0 | uint32_t stride = mode.fencYuv->m_size; |
344 | | |
345 | | // init availability pattern |
346 | 0 | uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; |
347 | 0 | IntraNeighbors intraNeighbors; |
348 | 0 | initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors); |
349 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode); |
350 | | |
351 | | // get prediction signal |
352 | 0 | predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); |
353 | |
|
354 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); |
355 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); |
356 | |
|
357 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
358 | 0 | coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
359 | | |
360 | | // store original entropy coding status |
361 | 0 | if (bEnableRDOQ) |
362 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
// the [stride % 64 == 0] index selects between two variants of the primitive
363 | 0 | primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
364 | |
|
365 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false); |
366 | 0 | if (numSig) |
367 | 0 | { |
368 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig); |
// NOTE(review): the reconQtYuv and tmpResiYuv offsets below are computed with
// mode.predYuv.m_size, whereas codeIntraLumaTSkip() passes each buffer's own m_size;
// confirm the two agree whenever the aligned variant can be selected.
369 | 0 | bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
370 | 0 | bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
371 | 0 | bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
372 | 0 | bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign; |
// recon = pred + residual
373 | 0 | primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride); |
374 | 0 | } |
375 | 0 | else |
376 | | // no coded residual, recon = pred |
377 | 0 | primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride); |
378 | |
|
// the CBF flag of this TU is stored in bit 'tuDepth'
379 | 0 | bCBF = !!numSig << tuDepth; |
380 | 0 | cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); |
381 | 0 | fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride); |
382 | |
|
// count the bits of this TU; CU-level syntax is only counted for the first partition
383 | 0 | m_entropyCoder.resetBits(); |
384 | 0 | if (!absPartIdx) |
385 | 0 | { |
386 | 0 | if (!cu.m_slice->isIntra()) |
387 | 0 | { |
388 | 0 | if (cu.m_slice->m_pps->bTransquantBypassEnabled) |
389 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
390 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
391 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
392 | 0 | } |
393 | |
|
394 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
395 | 0 | } |
// intra direction: once for a 2Nx2N CU; otherwise once per quarter of the CU
396 | 0 | if (cu.m_partSize[0] == SIZE_2Nx2N) |
397 | 0 | { |
398 | 0 | if (!absPartIdx) |
399 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, 0, false); |
400 | 0 | } |
401 | 0 | else |
402 | 0 | { |
403 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
404 | 0 | if (!tuDepth) |
405 | 0 | { |
406 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) |
407 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); |
408 | 0 | } |
409 | 0 | else if (!(absPartIdx & (qNumParts - 1))) |
410 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); |
411 | 0 | } |
// at the smallest allowed TU size no subdivision flag is coded
412 | 0 | if (log2TrSize != depthRange[0]) |
413 | 0 | m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); |
414 | |
|
415 | 0 | m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); |
416 | |
|
417 | 0 | if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) |
418 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA); |
419 | |
|
420 | 0 | fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); |
421 | |
|
// any non-zero rdPenalty: the bit count of a 32x32 TU outside I slices is multiplied by 4
422 | 0 | if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE) |
423 | 0 | fullCost.bits *= 4; |
424 | |
|
// RD cost flavour: psy-rd takes precedence over ssim-rd, plain RD cost otherwise
425 | 0 | if (m_rdCost.m_psyRd) |
426 | 0 | { |
427 | 0 | fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride); |
428 | 0 | fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
429 | 0 | } |
430 | 0 | else if(m_rdCost.m_ssimRd) |
431 | 0 | { |
432 | 0 | fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx); |
433 | 0 | fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
434 | 0 | } |
435 | 0 | else |
436 | 0 | fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits); |
437 | 0 | } |
438 | 0 | else |
// full size not allowed: make sure the split alternative wins the comparison below
439 | 0 | fullCost.rdcost = MAX_INT64; |
440 | |
|
// ---- alternative 2: split into four sub-TUs ----
441 | 0 | if (mightSplit) |
442 | 0 | { |
443 | 0 | if (mightNotSplit) |
444 | 0 | { |
445 | 0 | m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode |
446 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode |
447 | 0 | } |
448 | | |
449 | | /* code split block */ |
450 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
451 | |
|
// transform skip is evaluated for the children, whose log2 size is log2TrSize - 1
452 | 0 | int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; |
453 | 0 | if (m_param->bEnableTSkipFast) |
454 | 0 | checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N; |
455 | |
|
456 | 0 | Cost splitCost; |
// OR of the children's CBF values read at tuDepth + 1
457 | 0 | uint32_t cbf = 0; |
458 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
459 | 0 | { |
460 | 0 | if (checkTransformSkip) |
461 | 0 | codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost); |
462 | 0 | else |
463 | 0 | codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange); |
464 | |
|
465 | 0 | cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); |
466 | 0 | } |
467 | 0 | cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth); |
468 | |
|
469 | 0 | if (mightNotSplit && log2TrSize != depthRange[0]) |
470 | 0 | { |
471 | | /* If we could have coded this TU depth, include cost of subdiv flag */ |
472 | 0 | m_entropyCoder.resetBits(); |
473 | 0 | m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); |
474 | 0 | splitCost.bits += m_entropyCoder.getNumberOfWrittenBits(); |
475 | |
|
// recompute the split RD cost from the accumulated totals, now including the flag bits
476 | 0 | if (m_rdCost.m_psyRd) |
477 | 0 | splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); |
478 | 0 | else if(m_rdCost.m_ssimRd) |
479 | 0 | splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); |
480 | 0 | else |
481 | 0 | splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits); |
482 | 0 | } |
483 | |
|
// split wins: report its cost and return before the full-size recon copy at the end
484 | 0 | if (splitCost.rdcost < fullCost.rdcost) |
485 | 0 | { |
486 | 0 | outCost.rdcost += splitCost.rdcost; |
487 | 0 | outCost.distortion += splitCost.distortion; |
488 | 0 | outCost.bits += splitCost.bits; |
489 | 0 | outCost.energy += splitCost.energy; |
490 | 0 | return; |
491 | 0 | } |
492 | 0 | else |
493 | 0 | { |
494 | | // recover entropy state of full-size TU encode |
495 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtTest); |
496 | | |
497 | | // recover transform index and Cbf values |
498 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); |
499 | 0 | cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); |
500 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); |
501 | 0 | } |
502 | 0 | } |
503 | | |
504 | | // set reconstruction for next intra prediction blocks if full TU prediction won |
505 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
506 | 0 | pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); |
507 | 0 | intptr_t picStride = reconPic->m_stride; |
508 | 0 | primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride); |
509 | |
|
// accumulate the full-size cost into the caller's running total
510 | 0 | outCost.rdcost += fullCost.rdcost; |
511 | 0 | outCost.distortion += fullCost.distortion; |
512 | 0 | outCost.bits += fullCost.bits; |
513 | 0 | outCost.energy += fullCost.energy; |
514 | 0 | } |
515 | | |
// Code one intra luma TU twice - first with the regular transform (useTSkip = 0), then
// with transform skip (useTSkip = 1) - and keep whichever has the lower RD cost.
// The winner's coefficients and reconstruction end up in the per-layer RQT buffers, the
// reconstruction is also copied into the frame's recon picture, and the winner's cost
// is ADDED to outCost (an accumulator; it is never reset here).
// Unlike codeIntraLumaQT() this function never splits the TU further.
516 | | void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost) |
517 | 0 | { |
518 | 0 | uint32_t fullDepth = cuGeom.depth + tuDepth; |
519 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
520 | 0 | uint32_t tuSize = 1 << log2TrSize; |
521 | 0 | bool bEnableRDOQ = !!m_param->rdoqLevel; |
522 | |
|
523 | 0 | X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n"); |
524 | |
|
525 | 0 | CUData& cu = mode.cu; |
526 | 0 | Yuv* predYuv = &mode.predYuv; |
527 | 0 | const Yuv* fencYuv = mode.fencYuv; |
528 | |
|
// best result so far; starts at MAX_INT64 so that the first pass always becomes the candidate
529 | 0 | Cost fullCost; |
530 | 0 | fullCost.rdcost = MAX_INT64; |
531 | 0 | int bTSkip = 0; |
532 | 0 | uint32_t bCBF = 0; |
533 | |
|
534 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
535 | 0 | pixel* pred = predYuv->getLumaAddr(absPartIdx); |
536 | 0 | int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); |
// the same stride is used for fenc, pred and residual below
537 | 0 | uint32_t stride = fencYuv->m_size; |
538 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
539 | | |
540 | | // init availability pattern |
541 | 0 | uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; |
542 | 0 | IntraNeighbors intraNeighbors; |
543 | 0 | initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors); |
544 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode); |
545 | | |
546 | | // get prediction signal |
547 | 0 | predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); |
548 | |
|
549 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); |
550 | |
|
551 | 0 | uint32_t qtLayer = log2TrSize - 2; |
552 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
553 | 0 | coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
554 | 0 | pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); |
555 | 0 | uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size; |
556 | | |
557 | | // store original entropy coding status |
558 | 0 | m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); |
559 | |
|
560 | 0 | if (bEnableRDOQ) |
561 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
562 | |
|
// pass 0 = regular transform, pass 1 = transform skip. checkTransformSkip is cleared
// inside the loop when the tskip pass yields no coefficients.
563 | 0 | int checkTransformSkip = 1; |
564 | 0 | for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) |
565 | 0 | { |
566 | 0 | uint64_t tmpCost; |
567 | 0 | uint32_t tmpEnergy = 0; |
568 | |
|
// the tskip pass writes into the scratch buffers m_tsCoeff / m_tsRecon (stride
// MAX_TS_SIZE) so that the result of the regular pass is not overwritten
569 | 0 | coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY); |
570 | 0 | pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt); |
571 | 0 | bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0)); |
572 | 0 | uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride); |
573 | |
|
// the residual is recomputed on every pass because invtransformNxN() below writes
// into the same residual buffer
574 | 0 | primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
575 | |
|
576 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip); |
577 | 0 | if (numSig) |
578 | 0 | { |
579 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig); |
580 | 0 | bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0; |
581 | 0 | bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0; |
582 | 0 | bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign; |
// recon = pred + residual
583 | 0 | primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride); |
584 | 0 | } |
585 | 0 | else if (useTSkip) |
586 | 0 | { |
587 | | /* do not allow tskip if CBF=0, pretend we did not try tskip */ |
588 | 0 | checkTransformSkip = 0; |
589 | 0 | break; |
590 | 0 | } |
591 | 0 | else |
592 | | // no residual coded, recon = pred |
593 | 0 | primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride); |
594 | | |
595 | 0 | sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride); |
596 | |
|
597 | 0 | cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth); |
598 | 0 | cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); |
599 | |
|
// rewind the entropy coder so the tskip pass starts from the same state as pass 0
600 | 0 | if (useTSkip) |
601 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); |
602 | |
|
// count the bits of this TU; CU-level syntax is only counted for the first partition
603 | 0 | m_entropyCoder.resetBits(); |
604 | 0 | if (!absPartIdx) |
605 | 0 | { |
606 | 0 | if (!cu.m_slice->isIntra()) |
607 | 0 | { |
608 | 0 | if (cu.m_slice->m_pps->bTransquantBypassEnabled) |
609 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
610 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
611 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
612 | 0 | } |
613 | |
|
614 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
615 | 0 | } |
// intra direction: once for a 2Nx2N CU; otherwise once per quarter of the CU
616 | 0 | if (cu.m_partSize[0] == SIZE_2Nx2N) |
617 | 0 | { |
618 | 0 | if (!absPartIdx) |
619 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, 0, false); |
620 | 0 | } |
621 | 0 | else |
622 | 0 | { |
623 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
624 | 0 | if (!tuDepth) |
625 | 0 | { |
626 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) |
627 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); |
628 | 0 | } |
629 | 0 | else if (!(absPartIdx & (qNumParts - 1))) |
630 | 0 | m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); |
631 | 0 | } |
632 | 0 | m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); |
633 | |
|
634 | 0 | m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); |
635 | |
|
636 | 0 | if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) |
637 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA); |
638 | |
|
639 | 0 | uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits(); |
640 | |
|
// keep the entropy state of the regular pass; it is reloaded after the loop if that pass wins
641 | 0 | if (!useTSkip) |
642 | 0 | m_entropyCoder.store(m_rqt[fullDepth].rqtTemp); |
643 | |
|
// RD cost flavour: psy-rd takes precedence over ssim-rd, plain RD cost otherwise
644 | 0 | if (m_rdCost.m_psyRd) |
645 | 0 | { |
646 | 0 | tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride); |
647 | 0 | tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy); |
648 | 0 | } |
649 | 0 | else if(m_rdCost.m_ssimRd) |
650 | 0 | { |
651 | 0 | tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, tmpRecon, tmpReconStride, log2TrSize, TEXT_LUMA, absPartIdx); |
652 | 0 | tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy); |
653 | 0 | } |
654 | 0 | else |
655 | 0 | tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits); |
656 | |
|
// strict '<': on a tie the earlier (regular transform) pass is kept
657 | 0 | if (tmpCost < fullCost.rdcost) |
658 | 0 | { |
659 | 0 | bTSkip = useTSkip; |
660 | 0 | bCBF = !!numSig; |
661 | 0 | fullCost.rdcost = tmpCost; |
662 | 0 | fullCost.distortion = tmpDist; |
663 | 0 | fullCost.bits = tmpBits; |
664 | 0 | fullCost.energy = tmpEnergy; |
665 | 0 | } |
666 | 0 | } |
667 | |
|
668 | 0 | if (bTSkip) |
669 | 0 | { |
// tskip won: move its result from the scratch buffers into the RQT layer buffers.
// NOTE(review): m_tsRecon was written with stride MAX_TS_SIZE but is read back here with
// stride tuSize; the two only match when tuSize == MAX_TS_SIZE - confirm that always holds.
670 | 0 | memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2)); |
671 | 0 | primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize); |
672 | 0 | } |
673 | 0 | else if (checkTransformSkip) |
674 | 0 | { |
// the regular transform won after a complete tskip pass: undo the flags, the CBF and
// the entropy state that the tskip pass left behind
675 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); |
676 | 0 | cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); |
677 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtTemp); |
678 | 0 | } |
679 | | |
680 | | // set reconstruction for next intra prediction blocks |
681 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
682 | 0 | pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); |
683 | 0 | intptr_t picStride = reconPic->m_stride; |
684 | 0 | primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride); |
685 | |
|
// accumulate the winner's cost into the caller's running total
686 | 0 | outCost.rdcost += fullCost.rdcost; |
687 | 0 | outCost.distortion += fullCost.distortion; |
688 | 0 | outCost.bits += fullCost.bits; |
689 | 0 | outCost.energy += fullCost.energy; |
690 | 0 | } |
691 | | |
692 | | /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */ |
693 | | void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]) |
694 | 0 | { |
695 | 0 | CUData& cu = mode.cu; |
696 | 0 | uint32_t fullDepth = cuGeom.depth + tuDepth; |
697 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
698 | 0 | bool bCheckFull = log2TrSize <= depthRange[1]; |
699 | |
|
700 | 0 | X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n"); |
701 | | |
702 | | /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible |
703 | | * since we are not measuring RD cost */ |
704 | 0 | if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4) |
705 | 0 | bCheckFull = false; |
706 | |
|
707 | 0 | if (bCheckFull) |
708 | 0 | { |
709 | 0 | const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); |
710 | 0 | pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); |
711 | 0 | int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); |
712 | 0 | uint32_t stride = mode.fencYuv->m_size; |
713 | | |
714 | | // init availability pattern |
715 | 0 | uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; |
716 | 0 | IntraNeighbors intraNeighbors; |
717 | 0 | initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors); |
718 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode); |
719 | | |
720 | | // get prediction signal |
721 | 0 | predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); |
722 | |
|
723 | 0 | X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n"); |
724 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); |
725 | |
|
726 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
727 | 0 | coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY; |
728 | |
|
729 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
730 | 0 | primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
731 | |
|
732 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
733 | 0 | pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); |
734 | 0 | intptr_t picStride = reconPic->m_stride; |
735 | |
|
736 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false); |
737 | 0 | if (numSig) |
738 | 0 | { |
739 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig); |
740 | 0 | bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0; |
741 | 0 | bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
742 | 0 | bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0; |
743 | 0 | bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign; |
744 | 0 | primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride); |
745 | 0 | cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); |
746 | 0 | } |
747 | 0 | else |
748 | 0 | { |
749 | 0 | primitives.cu[sizeIdx].copy_pp(picReconY, picStride, pred, stride); |
750 | 0 | cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); |
751 | 0 | } |
752 | 0 | } |
753 | 0 | else |
754 | 0 | { |
755 | 0 | X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n"); |
756 | | |
757 | | /* code split block */ |
758 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
759 | 0 | uint32_t cbf = 0; |
760 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
761 | 0 | { |
762 | 0 | residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange); |
763 | 0 | cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); |
764 | 0 | } |
765 | 0 | cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth); |
766 | 0 | } |
767 | 0 | } |
768 | | |
769 | | void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx) |
770 | 0 | { |
771 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
772 | |
|
773 | 0 | if (tuDepth == cu.m_tuDepth[absPartIdx]) |
774 | 0 | { |
775 | 0 | uint32_t qtLayer = log2TrSize - 2; |
776 | | |
777 | | // copy transform coefficients |
778 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
779 | 0 | coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
780 | 0 | coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY; |
781 | 0 | memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2)); |
782 | | |
783 | | // copy reconstruction |
784 | 0 | m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize); |
785 | 0 | } |
786 | 0 | else |
787 | 0 | { |
788 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
789 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
790 | 0 | extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx); |
791 | 0 | } |
792 | 0 | } |
793 | | |
/* Shift each of the two sub-TU CBF values up by one bit and place the OR of
 * both original values in bit 0. The two-entry array is updated in place. */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t eitherCoded = subTUCBF[0] | subTUCBF[1];
    for (int half = 0; half < 2; half++)
        subTUCBF[half] = (subTUCBF[half] << 1) | eitherCoded;
}
800 | | |
801 | | /* 4:2:2 post-TU split processing */ |
802 | | void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx) |
803 | 0 | { |
804 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
805 | |
|
806 | 0 | if (log2TrSize == 2) |
807 | 0 | { |
808 | 0 | X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
809 | 0 | ++log2TrSize; |
810 | 0 | } |
811 | |
|
812 | 0 | uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1); |
813 | | |
814 | | // move the CBFs down a level and set the parent CBF |
815 | 0 | uint8_t subTUCBF[2]; |
816 | 0 | subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth); |
817 | 0 | subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth); |
818 | 0 | offsetCBFs(subTUCBF); |
819 | |
|
820 | 0 | cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts); |
821 | 0 | cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts); |
822 | 0 | } |
823 | | |
824 | | /* returns distortion */ |
825 | | void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost) |
826 | 0 | { |
827 | 0 | CUData& cu = mode.cu; |
828 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
829 | 0 | bool bEnableRDOQ = !!m_param->rdoqLevel; |
830 | |
|
831 | 0 | if (tuDepth < cu.m_tuDepth[absPartIdx]) |
832 | 0 | { |
833 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
834 | 0 | uint32_t splitCbfU = 0, splitCbfV = 0; |
835 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
836 | 0 | { |
837 | 0 | codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost); |
838 | 0 | splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); |
839 | 0 | splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); |
840 | 0 | } |
841 | 0 | cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth); |
842 | 0 | cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth); |
843 | |
|
844 | 0 | return; |
845 | 0 | } |
846 | | |
847 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
848 | 0 | uint32_t tuDepthC = tuDepth; |
849 | 0 | if (log2TrSizeC < 2) |
850 | 0 | { |
851 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
852 | 0 | if (absPartIdx & 3) |
853 | 0 | return; |
854 | 0 | log2TrSizeC = 2; |
855 | 0 | tuDepthC--; |
856 | 0 | } |
857 | | |
858 | 0 | if (bEnableRDOQ) |
859 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); |
860 | |
|
861 | 0 | bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; |
862 | 0 | checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]); |
863 | 0 | if (checkTransformSkip) |
864 | 0 | { |
865 | 0 | codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost); |
866 | 0 | return; |
867 | 0 | } |
868 | | |
869 | 0 | ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; |
870 | 0 | uint32_t qtLayer = log2TrSize - 2; |
871 | 0 | uint32_t stride = mode.fencYuv->m_csize; |
872 | 0 | const uint32_t sizeIdxC = log2TrSizeC - 2; |
873 | |
|
874 | 0 | uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2; |
875 | 0 | const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; |
876 | |
|
877 | 0 | TURecurse tuIterator(splitType, curPartNum, absPartIdx); |
878 | 0 | do |
879 | 0 | { |
880 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
881 | |
|
882 | 0 | IntraNeighbors intraNeighbors; |
883 | 0 | initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors); |
884 | |
|
885 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
886 | 0 | { |
887 | 0 | TextType ttype = (TextType)chromaId; |
888 | |
|
889 | 0 | const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); |
890 | 0 | pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); |
891 | 0 | int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
892 | 0 | uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); |
893 | 0 | coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
894 | 0 | pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); |
895 | 0 | uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; |
896 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
897 | 0 | pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); |
898 | 0 | intptr_t picStride = reconPic->m_strideC; |
899 | |
|
900 | 0 | uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; |
901 | 0 | if (chromaPredMode == DM_CHROMA_IDX) |
902 | 0 | chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; |
903 | 0 | if (m_csp == X265_CSP_I422) |
904 | 0 | chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; |
905 | | |
906 | | // init availability pattern |
907 | 0 | initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId); |
908 | | |
909 | | // get prediction signal |
910 | 0 | predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); |
911 | 0 | cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
912 | |
|
913 | 0 | primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
914 | |
|
915 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); |
916 | 0 | if (numSig) |
917 | 0 | { |
918 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig); |
919 | 0 | bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
920 | 0 | bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
921 | 0 | bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
922 | 0 | bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0); |
923 | 0 | primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride); |
924 | 0 | cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
925 | 0 | } |
926 | 0 | else |
927 | 0 | { |
928 | | // no coded residual, recon = pred |
929 | 0 | primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, pred, stride); |
930 | 0 | cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
931 | 0 | } |
932 | |
|
933 | 0 | outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride)); |
934 | |
|
935 | 0 | if (m_rdCost.m_psyRd) |
936 | 0 | outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride); |
937 | 0 | else if(m_rdCost.m_ssimRd) |
938 | 0 | outCost.energy += m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC); |
939 | |
|
940 | 0 | primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride); |
941 | 0 | } |
942 | 0 | } |
943 | 0 | while (tuIterator.isNextSection()); |
944 | |
|
945 | 0 | if (splitType == VERTICAL_SPLIT) |
946 | 0 | { |
947 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
948 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
949 | 0 | } |
950 | 0 | } |
951 | | |
952 | | /* returns distortion */ |
953 | | void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost) |
954 | 0 | { |
955 | 0 | CUData& cu = mode.cu; |
956 | 0 | uint32_t fullDepth = cuGeom.depth + tuDepth; |
957 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
958 | 0 | const uint32_t log2TrSizeC = 2; |
959 | 0 | uint32_t qtLayer = log2TrSize - 2; |
960 | | |
961 | | /* At the TU layers above this one, no RDO is performed, only distortion is being measured, |
962 | | * so the entropy coder is not very accurate. The best we can do is return it in the same |
963 | | * condition as it arrived, and to do all bit estimates from the same state. */ |
964 | 0 | m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); |
965 | |
|
966 | 0 | uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2; |
967 | 0 | const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; |
968 | |
|
969 | 0 | TURecurse tuIterator(splitType, curPartNum, absPartIdx); |
970 | 0 | do |
971 | 0 | { |
972 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
973 | |
|
974 | 0 | IntraNeighbors intraNeighbors; |
975 | 0 | initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors); |
976 | |
|
977 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
978 | 0 | { |
979 | 0 | TextType ttype = (TextType)chromaId; |
980 | |
|
981 | 0 | const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); |
982 | 0 | pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); |
983 | 0 | int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); |
984 | 0 | uint32_t stride = mode.fencYuv->m_csize; |
985 | 0 | const uint32_t sizeIdxC = log2TrSizeC - 2; |
986 | |
|
987 | 0 | uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); |
988 | 0 | coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
989 | 0 | pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); |
990 | 0 | uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; |
991 | | |
992 | | // init availability pattern |
993 | 0 | initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId); |
994 | |
|
995 | 0 | uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; |
996 | 0 | if (chromaPredMode == DM_CHROMA_IDX) |
997 | 0 | chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; |
998 | 0 | if (m_csp == X265_CSP_I422) |
999 | 0 | chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; |
1000 | | |
1001 | | // get prediction signal |
1002 | 0 | predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); |
1003 | |
|
1004 | 0 | uint64_t bCost = MAX_INT64; |
1005 | 0 | sse_t bDist = 0; |
1006 | 0 | uint32_t bCbf = 0; |
1007 | 0 | uint32_t bEnergy = 0; |
1008 | 0 | int bTSkip = 0; |
1009 | |
|
1010 | 0 | int checkTransformSkip = 1; |
1011 | 0 | for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) |
1012 | 0 | { |
1013 | 0 | coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC); |
1014 | 0 | pixel* recon = (useTSkip ? m_tsRecon : reconQt); |
1015 | 0 | uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride); |
1016 | |
|
1017 | 0 | primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
1018 | |
|
1019 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip); |
1020 | 0 | if (numSig) |
1021 | 0 | { |
1022 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig); |
1023 | 0 | bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0; |
1024 | 0 | bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
1025 | 0 | bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
1026 | 0 | bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0); |
1027 | 0 | primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride); |
1028 | 0 | cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1029 | 0 | } |
1030 | 0 | else if (useTSkip) |
1031 | 0 | { |
1032 | 0 | checkTransformSkip = 0; |
1033 | 0 | break; |
1034 | 0 | } |
1035 | 0 | else |
1036 | 0 | { |
1037 | 0 | primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride); |
1038 | 0 | cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1039 | 0 | } |
1040 | 0 | sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride); |
1041 | 0 | tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist); |
1042 | |
|
1043 | 0 | cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1044 | |
|
1045 | 0 | uint32_t tmpBits = 0, tmpEnergy = 0; |
1046 | 0 | if (numSig) |
1047 | 0 | { |
1048 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); |
1049 | 0 | m_entropyCoder.resetBits(); |
1050 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId); |
1051 | 0 | tmpBits = m_entropyCoder.getNumberOfWrittenBits(); |
1052 | 0 | } |
1053 | |
|
1054 | 0 | uint64_t tmpCost; |
1055 | 0 | if (m_rdCost.m_psyRd) |
1056 | 0 | { |
1057 | 0 | tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride); |
1058 | 0 | tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy); |
1059 | 0 | } |
1060 | 0 | else if(m_rdCost.m_ssimRd) |
1061 | 0 | { |
1062 | 0 | tmpEnergy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSizeC, ttype, absPartIdxC); |
1063 | 0 | tmpCost = m_rdCost.calcSsimRdCost(tmpDist, tmpBits, tmpEnergy); |
1064 | 0 | } |
1065 | 0 | else |
1066 | 0 | tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits); |
1067 | |
|
1068 | 0 | if (tmpCost < bCost) |
1069 | 0 | { |
1070 | 0 | bCost = tmpCost; |
1071 | 0 | bDist = tmpDist; |
1072 | 0 | bTSkip = useTSkip; |
1073 | 0 | bCbf = !!numSig; |
1074 | 0 | bEnergy = tmpEnergy; |
1075 | 0 | } |
1076 | 0 | } |
1077 | |
|
1078 | 0 | if (bTSkip) |
1079 | 0 | { |
1080 | 0 | memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2)); |
1081 | 0 | primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE); |
1082 | 0 | } |
1083 | |
|
1084 | 0 | cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1085 | 0 | cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1086 | |
|
1087 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
1088 | 0 | pixel* reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); |
1089 | 0 | intptr_t picStride = reconPic->m_strideC; |
1090 | 0 | primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride); |
1091 | |
|
1092 | 0 | outCost.distortion += bDist; |
1093 | 0 | outCost.energy += bEnergy; |
1094 | 0 | } |
1095 | 0 | } |
1096 | 0 | while (tuIterator.isNextSection()); |
1097 | |
|
1098 | 0 | if (splitType == VERTICAL_SPLIT) |
1099 | 0 | { |
1100 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
1101 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
1102 | 0 | } |
1103 | |
|
1104 | 0 | m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); |
1105 | 0 | } |
1106 | | |
1107 | | void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth) |
1108 | 0 | { |
1109 | 0 | uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; |
1110 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
1111 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
1112 | |
|
1113 | 0 | if (tuDepthL == tuDepth || log2TrSizeC == 2) |
1114 | 0 | { |
1115 | | // copy transform coefficients |
1116 | 0 | uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); |
1117 | 0 | uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); |
1118 | |
|
1119 | 0 | uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth); |
1120 | 0 | coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; |
1121 | 0 | coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; |
1122 | 0 | coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; |
1123 | 0 | coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC; |
1124 | 0 | memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC); |
1125 | 0 | memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC); |
1126 | | |
1127 | | // copy reconstruction |
1128 | 0 | m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift); |
1129 | 0 | } |
1130 | 0 | else |
1131 | 0 | { |
1132 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
1133 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
1134 | 0 | extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1); |
1135 | 0 | } |
1136 | 0 | } |
1137 | | |
1138 | | void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth) |
1139 | 0 | { |
1140 | 0 | CUData& cu = mode.cu; |
1141 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[absPartIdx] - tuDepth; |
1142 | |
|
1143 | 0 | if (tuDepth < cu.m_tuDepth[absPartIdx]) |
1144 | 0 | { |
1145 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
1146 | 0 | uint32_t splitCbfU = 0, splitCbfV = 0; |
1147 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
1148 | 0 | { |
1149 | 0 | residualQTIntraChroma(mode, cuGeom, qPartIdx, tuDepth + 1); |
1150 | 0 | splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); |
1151 | 0 | splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); |
1152 | 0 | } |
1153 | 0 | cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth); |
1154 | 0 | cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth); |
1155 | |
|
1156 | 0 | return; |
1157 | 0 | } |
1158 | | |
1159 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
1160 | 0 | uint32_t tuDepthC = tuDepth; |
1161 | 0 | if (log2TrSizeC < 2) |
1162 | 0 | { |
1163 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
1164 | 0 | if (absPartIdx & 3) |
1165 | 0 | return; |
1166 | 0 | log2TrSizeC = 2; |
1167 | 0 | tuDepthC--; |
1168 | 0 | } |
1169 | | |
1170 | 0 | ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; |
1171 | 0 | uint32_t stride = mode.fencYuv->m_csize; |
1172 | 0 | const uint32_t sizeIdxC = log2TrSizeC - 2; |
1173 | |
|
1174 | 0 | uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2; |
1175 | 0 | const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; |
1176 | |
|
1177 | 0 | TURecurse tuIterator(splitType, curPartNum, absPartIdx); |
1178 | 0 | do |
1179 | 0 | { |
1180 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
1181 | |
|
1182 | 0 | IntraNeighbors intraNeighbors; |
1183 | 0 | initIntraNeighbors(cu, absPartIdxC, tuDepthC, false, &intraNeighbors); |
1184 | |
|
1185 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
1186 | 0 | { |
1187 | 0 | TextType ttype = (TextType)chromaId; |
1188 | |
|
1189 | 0 | const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); |
1190 | 0 | pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); |
1191 | 0 | int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
1192 | 0 | uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); |
1193 | 0 | coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC; |
1194 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
1195 | 0 | pixel* picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); |
1196 | 0 | intptr_t picStride = reconPic->m_strideC; |
1197 | |
|
1198 | 0 | uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; |
1199 | 0 | if (chromaPredMode == DM_CHROMA_IDX) |
1200 | 0 | chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; |
1201 | 0 | if (m_csp == X265_CSP_I422) |
1202 | 0 | chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; |
1203 | | |
1204 | | // init availability pattern |
1205 | 0 | initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId); |
1206 | | |
1207 | | // get prediction signal |
1208 | 0 | predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); |
1209 | |
|
1210 | 0 | X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n"); |
1211 | |
|
1212 | 0 | primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride); |
1213 | |
|
1214 | 0 | uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); |
1215 | 0 | if (numSig) |
1216 | 0 | { |
1217 | 0 | m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig); |
1218 | 0 | bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0; |
1219 | 0 | bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
1220 | 0 | bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0; |
1221 | 0 | bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0); |
1222 | 0 | primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride); |
1223 | 0 | cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1224 | 0 | } |
1225 | 0 | else |
1226 | 0 | { |
1227 | | // no coded residual, recon = pred |
1228 | 0 | primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, pred, stride); |
1229 | 0 | cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); |
1230 | 0 | } |
1231 | 0 | } |
1232 | 0 | } |
1233 | 0 | while (tuIterator.isNextSection()); |
1234 | |
|
1235 | 0 | if (splitType == VERTICAL_SPLIT) |
1236 | 0 | { |
1237 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
1238 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
1239 | 0 | } |
1240 | 0 | } |
1241 | | |
1242 | | void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize) |
1243 | 0 | { |
1244 | 0 | CUData& cu = intraMode.cu; |
1245 | |
|
1246 | 0 | cu.setPartSizeSubParts(partSize); |
1247 | 0 | cu.setPredModeSubParts(MODE_INTRA); |
1248 | |
|
1249 | 0 | uint32_t tuDepthRange[2]; |
1250 | 0 | cu.getIntraTUQtDepthRange(tuDepthRange, 0); |
1251 | |
|
1252 | 0 | intraMode.initCosts(); |
1253 | 0 | intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange); |
1254 | 0 | if (m_csp != X265_CSP_I400) |
1255 | 0 | { |
1256 | 0 | intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom); |
1257 | 0 | intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion; |
1258 | 0 | } |
1259 | 0 | else |
1260 | 0 | intraMode.distortion += intraMode.lumaDistortion; |
1261 | 0 | cu.m_distortion[0] = intraMode.distortion; |
1262 | 0 | m_entropyCoder.resetBits(); |
1263 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
1264 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
1265 | |
|
1266 | 0 | int skipFlagBits = 0; |
1267 | 0 | if (!m_slice->isIntra()) |
1268 | 0 | { |
1269 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
1270 | 0 | skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
1271 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
1272 | 0 | } |
1273 | |
|
1274 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
1275 | 0 | m_entropyCoder.codePredInfo(cu, 0); |
1276 | 0 | intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
1277 | |
|
1278 | 0 | bool bCodeDQP = m_slice->m_pps->bUseDQP; |
1279 | 0 | m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); |
1280 | 0 | m_entropyCoder.store(intraMode.contexts); |
1281 | 0 | intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); |
1282 | 0 | intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits; |
1283 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1284 | 0 | if (m_rdCost.m_psyRd) |
1285 | 0 | intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size); |
1286 | 0 | else if(m_rdCost.m_ssimRd) |
1287 | 0 | intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0); |
1288 | |
|
1289 | 0 | intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size); |
1290 | |
|
1291 | 0 | updateModeCost(intraMode); |
1292 | 0 | checkDQP(intraMode, cuGeom); |
1293 | |
|
1294 | | #if ENABLE_SCC_EXT |
1295 | | if (m_param->bEnableSCC) |
1296 | | intraMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx); |
1297 | | #endif |
1298 | 0 | } |
1299 | | |
1300 | | /* Note that this function does not save the best intra prediction, it must |
1301 | | * be generated later. It records the best mode in the cu */ |
1302 | | void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom) |
1303 | 0 | { |
1304 | 0 | ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); |
1305 | |
|
1306 | 0 | CUData& cu = intraMode.cu; |
1307 | 0 | uint32_t depth = cuGeom.depth; |
1308 | |
|
1309 | 0 | cu.setPartSizeSubParts(SIZE_2Nx2N); |
1310 | 0 | cu.setPredModeSubParts(MODE_INTRA); |
1311 | |
|
1312 | 0 | const uint32_t initTuDepth = 0; |
1313 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1314 | 0 | uint32_t tuSize = 1 << log2TrSize; |
1315 | 0 | const uint32_t absPartIdx = 0; |
1316 | | |
1317 | | // Reference sample smoothing |
1318 | 0 | IntraNeighbors intraNeighbors; |
1319 | 0 | initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); |
1320 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); |
1321 | |
|
1322 | 0 | const pixel* fenc = intraMode.fencYuv->m_buf[0]; |
1323 | 0 | uint32_t stride = intraMode.fencYuv->m_size; |
1324 | |
|
1325 | 0 | int sad, bsad; |
1326 | 0 | uint32_t bits, bbits, mode, bmode; |
1327 | 0 | uint64_t cost, bcost; |
1328 | | |
1329 | | // 33 Angle modes once |
1330 | 0 | int scaleTuSize = tuSize; |
1331 | 0 | int scaleStride = stride; |
1332 | 0 | int costShift = 0; |
1333 | 0 | int sizeIdx = log2TrSize - 2; |
1334 | |
|
1335 | 0 | if (tuSize > 32) |
1336 | 0 | { |
1337 | | // CU is 64x64, we scale to 32x32 and adjust required parameters |
1338 | 0 | primitives.scale2D_64to32(m_fencScaled, fenc, stride); |
1339 | 0 | fenc = m_fencScaled; |
1340 | |
|
1341 | 0 | pixel nScale[129]; |
1342 | 0 | intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0]; |
1343 | 0 | primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1); |
1344 | | |
1345 | | // we do not estimate filtering for downscaled samples |
1346 | 0 | memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels |
1347 | 0 | memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel)); |
1348 | |
|
1349 | 0 | scaleTuSize = 32; |
1350 | 0 | scaleStride = 32; |
1351 | 0 | costShift = 2; |
1352 | 0 | sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 |
1353 | 0 | } |
1354 | |
|
1355 | 0 | pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; |
1356 | 0 | int predsize = scaleTuSize * scaleTuSize; |
1357 | |
|
1358 | 0 | m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); |
1359 | | |
1360 | | /* there are three cost tiers for intra modes: |
1361 | | * pred[0] - mode probable, least cost |
1362 | | * pred[1], pred[2] - less probable, slightly more cost |
1363 | | * non-mpm modes - all cost the same (rbits) */ |
1364 | 0 | uint64_t mpms; |
1365 | 0 | uint32_t mpmModes[3]; |
1366 | 0 | uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); |
1367 | | |
1368 | | // DC |
1369 | 0 | primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); |
1370 | 0 | bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; |
1371 | 0 | bmode = mode = DC_IDX; |
1372 | 0 | bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1373 | 0 | bcost = m_rdCost.calcRdSADCost(bsad, bbits); |
1374 | | |
1375 | | // PLANAR |
1376 | 0 | pixel* planar = intraNeighbourBuf[0]; |
1377 | 0 | if (tuSize & (8 | 16 | 32)) |
1378 | 0 | planar = intraNeighbourBuf[1]; |
1379 | |
|
1380 | 0 | primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0); |
1381 | 0 | sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; |
1382 | 0 | mode = PLANAR_IDX; |
1383 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1384 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); |
1385 | 0 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); |
1386 | |
|
1387 | 0 | bool allangs = true; |
1388 | 0 | if (primitives.cu[sizeIdx].intra_pred_allangs) |
1389 | 0 | { |
1390 | 0 | primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); |
1391 | 0 | primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); |
1392 | 0 | } |
1393 | 0 | else |
1394 | 0 | allangs = false; |
1395 | |
|
1396 | 0 | #define TRY_ANGLE(angle) \ |
1397 | 0 | if (allangs) { \ |
1398 | 0 | if (angle < 18) \ |
1399 | 0 | sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ |
1400 | 0 | else \ |
1401 | 0 | sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ |
1402 | 0 | bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ |
1403 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); \ |
1404 | 0 | } else { \ |
1405 | 0 | int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \ |
1406 | 0 | primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \ |
1407 | 0 | sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \ |
1408 | 0 | bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ |
1409 | 0 | cost = m_rdCost.calcRdSADCost(sad, bits); \ |
1410 | 0 | } |
1411 | |
|
1412 | 0 | if (m_param->bEnableFastIntra) |
1413 | 0 | { |
1414 | 0 | int asad = 0; |
1415 | 0 | uint32_t lowmode, highmode, amode = 5, abits = 0; |
1416 | 0 | uint64_t acost = MAX_INT64; |
1417 | | |
1418 | | /* pick the best angle, sampling at distance of 5 */ |
1419 | 0 | for (mode = 5; mode < 35; mode += 5) |
1420 | 0 | { |
1421 | 0 | TRY_ANGLE(mode); |
1422 | 0 | COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); |
1423 | 0 | } |
1424 | | |
1425 | | /* refine best angle at distance 2, then distance 1 */ |
1426 | 0 | for (uint32_t dist = 2; dist >= 1; dist--) |
1427 | 0 | { |
1428 | 0 | lowmode = amode - dist; |
1429 | 0 | highmode = amode + dist; |
1430 | |
|
1431 | 0 | X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); |
1432 | 0 | TRY_ANGLE(lowmode); |
1433 | 0 | COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); |
1434 | |
|
1435 | 0 | X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); |
1436 | 0 | TRY_ANGLE(highmode); |
1437 | 0 | COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); |
1438 | 0 | } |
1439 | |
|
1440 | 0 | if (amode == 33) |
1441 | 0 | { |
1442 | 0 | TRY_ANGLE(34); |
1443 | 0 | COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); |
1444 | 0 | } |
1445 | |
|
1446 | 0 | COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); |
1447 | 0 | } |
1448 | 0 | else // calculate and search all intra prediction angles for lowest cost |
1449 | 0 | { |
1450 | 0 | for (mode = 2; mode < 35; mode++) |
1451 | 0 | { |
1452 | 0 | TRY_ANGLE(mode); |
1453 | 0 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); |
1454 | 0 | } |
1455 | 0 | } |
1456 | |
|
1457 | 0 | cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth); |
1458 | 0 | intraMode.initCosts(); |
1459 | 0 | intraMode.totalBits = bbits; |
1460 | 0 | intraMode.distortion = bsad; |
1461 | 0 | intraMode.sa8dCost = bcost; |
1462 | 0 | intraMode.sa8dBits = bbits; |
1463 | 0 | } |
1464 | | |
1465 | | void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) |
1466 | 0 | { |
1467 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1468 | |
|
1469 | 0 | CUData& cu = intraMode.cu; |
1470 | 0 | Yuv* reconYuv = &intraMode.reconYuv; |
1471 | |
|
1472 | 0 | X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); |
1473 | 0 | X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); |
1474 | |
|
1475 | 0 | uint32_t tuDepthRange[2]; |
1476 | 0 | cu.getIntraTUQtDepthRange(tuDepthRange, 0); |
1477 | |
|
1478 | 0 | m_entropyCoder.load(m_rqt[cuGeom.depth].cur); |
1479 | |
|
1480 | 0 | Cost icosts; |
1481 | 0 | codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); |
1482 | 0 | extractIntraResultQT(cu, *reconYuv, 0, 0); |
1483 | |
|
1484 | 0 | intraMode.lumaDistortion = icosts.distortion; |
1485 | 0 | if (m_csp != X265_CSP_I400) |
1486 | 0 | { |
1487 | 0 | intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom); |
1488 | 0 | intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion; |
1489 | 0 | } |
1490 | 0 | else |
1491 | 0 | intraMode.distortion = intraMode.lumaDistortion; |
1492 | |
|
1493 | 0 | m_entropyCoder.resetBits(); |
1494 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
1495 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
1496 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
1497 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
1498 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
1499 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
1500 | 0 | m_entropyCoder.codePredInfo(cu, 0); |
1501 | 0 | intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
1502 | |
|
1503 | 0 | bool bCodeDQP = m_slice->m_pps->bUseDQP; |
1504 | 0 | m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); |
1505 | |
|
1506 | 0 | intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); |
1507 | 0 | intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits; |
1508 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1509 | 0 | if (m_rdCost.m_psyRd) |
1510 | 0 | intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
1511 | 0 | else if(m_rdCost.m_ssimRd) |
1512 | 0 | intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cuGeom.log2CUSize, TEXT_LUMA, 0); |
1513 | |
|
1514 | 0 | intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size); |
1515 | 0 | m_entropyCoder.store(intraMode.contexts); |
1516 | 0 | updateModeCost(intraMode); |
1517 | 0 | checkDQP(intraMode, cuGeom); |
1518 | 0 | } |
1519 | | |
1520 | | sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]) |
1521 | 0 | { |
1522 | 0 | CUData& cu = intraMode.cu; |
1523 | 0 | Yuv* reconYuv = &intraMode.reconYuv; |
1524 | 0 | Yuv* predYuv = &intraMode.predYuv; |
1525 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1526 | |
|
1527 | 0 | uint32_t depth = cuGeom.depth; |
1528 | 0 | uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; |
1529 | 0 | uint32_t numPU = 1 << (2 * initTuDepth); |
1530 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1531 | 0 | uint32_t tuSize = 1 << log2TrSize; |
1532 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
1533 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
1534 | 0 | uint32_t absPartIdx = 0; |
1535 | 0 | sse_t totalDistortion = 0; |
1536 | |
|
1537 | 0 | int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N; |
1538 | | |
1539 | | // loop over partitions |
1540 | 0 | for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts) |
1541 | 0 | { |
1542 | 0 | uint32_t bmode = 0; |
1543 | |
|
1544 | 0 | if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX) |
1545 | 0 | bmode = intraMode.cu.m_lumaIntraDir[puIdx]; |
1546 | 0 | else |
1547 | 0 | { |
1548 | 0 | uint64_t candCostList[MAX_RD_INTRA_MODES]; |
1549 | 0 | uint32_t rdModeList[MAX_RD_INTRA_MODES]; |
1550 | 0 | uint64_t bcost; |
1551 | 0 | int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); |
1552 | |
|
1553 | 0 | { |
1554 | 0 | ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); |
1555 | | |
1556 | | // Reference sample smoothing |
1557 | 0 | IntraNeighbors intraNeighbors; |
1558 | 0 | initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); |
1559 | 0 | initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); |
1560 | | |
1561 | | // determine set of modes to be tested (using prediction signal only) |
1562 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
1563 | 0 | uint32_t stride = predYuv->m_size; |
1564 | |
|
1565 | 0 | int scaleTuSize = tuSize; |
1566 | 0 | int scaleStride = stride; |
1567 | 0 | int costShift = 0; |
1568 | |
|
1569 | 0 | m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); |
1570 | | |
1571 | | /* there are three cost tiers for intra modes: |
1572 | | * pred[0] - mode probable, least cost |
1573 | | * pred[1], pred[2] - less probable, slightly more cost |
1574 | | * non-mpm modes - all cost the same (rbits) */ |
1575 | 0 | uint64_t mpms; |
1576 | 0 | uint32_t mpmModes[3]; |
1577 | 0 | uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); |
1578 | |
|
1579 | 0 | pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; |
1580 | 0 | uint64_t modeCosts[35]; |
1581 | | |
1582 | | // DC |
1583 | 0 | primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); |
1584 | 0 | uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits; |
1585 | 0 | uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; |
1586 | 0 | modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); |
1587 | | |
1588 | | // PLANAR |
1589 | 0 | pixel* planar = intraNeighbourBuf[0]; |
1590 | 0 | if (tuSize >= 8 && tuSize <= 32) |
1591 | 0 | planar = intraNeighbourBuf[1]; |
1592 | |
|
1593 | 0 | primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0); |
1594 | 0 | bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits; |
1595 | 0 | sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; |
1596 | 0 | modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits); |
1597 | 0 | COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]); |
1598 | | |
1599 | | // angular predictions |
1600 | 0 | if (primitives.cu[sizeIdx].intra_pred_allangs) |
1601 | 0 | { |
1602 | 0 | primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); |
1603 | 0 | primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); |
1604 | 0 | for (int mode = 2; mode < 35; mode++) |
1605 | 0 | { |
1606 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1607 | 0 | if (mode < 18) |
1608 | 0 | sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; |
1609 | 0 | else |
1610 | 0 | sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; |
1611 | 0 | modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); |
1612 | 0 | COPY1_IF_LT(bcost, modeCosts[mode]); |
1613 | 0 | } |
1614 | 0 | } |
1615 | 0 | else |
1616 | 0 | { |
1617 | 0 | for (int mode = 2; mode < 35; mode++) |
1618 | 0 | { |
1619 | 0 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; |
1620 | 0 | int filter = !!(g_intraFilterFlags[mode] & scaleTuSize); |
1621 | 0 | primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16); |
1622 | 0 | sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift; |
1623 | 0 | modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); |
1624 | 0 | COPY1_IF_LT(bcost, modeCosts[mode]); |
1625 | 0 | } |
1626 | 0 | } |
1627 | | |
1628 | | /* Find the top maxCandCount candidate modes with cost within 25% of best |
1629 | | * or among the most probable modes. maxCandCount is derived from the |
1630 | | * rdLevel and depth. In general we want to try more modes at slower RD |
1631 | | * levels and at higher depths */ |
1632 | 0 | for (int i = 0; i < maxCandCount; i++) |
1633 | 0 | candCostList[i] = MAX_INT64; |
1634 | |
|
1635 | 0 | uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25% |
1636 | 0 | for (int mode = 0; mode < 35; mode++) |
1637 | 0 | if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) |
1638 | | /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */ |
1639 | 0 | updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList); |
1640 | 0 | } |
1641 | | |
1642 | | /* measure best candidates using simple RDO (no TU splits) */ |
1643 | 0 | bcost = MAX_INT64; |
1644 | 0 | for (int i = 0; i < maxCandCount; i++) |
1645 | 0 | { |
1646 | 0 | if (candCostList[i] == MAX_INT64) |
1647 | 0 | break; |
1648 | | |
1649 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1650 | |
|
1651 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1652 | 0 | cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth); |
1653 | |
|
1654 | 0 | Cost icosts; |
1655 | 0 | if (checkTransformSkip) |
1656 | 0 | codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); |
1657 | 0 | else |
1658 | 0 | codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange); |
1659 | 0 | COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]); |
1660 | 0 | } |
1661 | 0 | } |
1662 | |
|
1663 | 0 | ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); |
1664 | | |
1665 | | /* remeasure best mode, allowing TU splits */ |
1666 | 0 | cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth); |
1667 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1668 | |
|
1669 | 0 | Cost icosts; |
1670 | 0 | if (checkTransformSkip) |
1671 | 0 | codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); |
1672 | 0 | else |
1673 | 0 | codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange); |
1674 | 0 | totalDistortion += icosts.distortion; |
1675 | |
|
1676 | 0 | extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx); |
1677 | | |
1678 | | // set reconstruction for next intra prediction blocks |
1679 | 0 | if (puIdx != numPU - 1) |
1680 | 0 | { |
1681 | | /* This has important implications for parallelism and RDO. It is writing intermediate results into the |
1682 | | * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also |
1683 | | * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think |
1684 | | * that the contexts should be tracked through each PU */ |
1685 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
1686 | 0 | pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); |
1687 | 0 | uint32_t dststride = reconPic->m_stride; |
1688 | 0 | const pixel* src = reconYuv->getLumaAddr(absPartIdx); |
1689 | 0 | uint32_t srcstride = reconYuv->m_size; |
1690 | 0 | primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride); |
1691 | 0 | } |
1692 | 0 | } |
1693 | |
|
1694 | 0 | if (numPU > 1) |
1695 | 0 | { |
1696 | 0 | uint32_t combCbfY = 0; |
1697 | 0 | for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
1698 | 0 | combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1); |
1699 | |
|
1700 | 0 | cu.m_cbf[0][0] |= combCbfY; |
1701 | 0 | } |
1702 | | |
1703 | | // TODO: remove this |
1704 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1705 | |
|
1706 | 0 | return totalDistortion; |
1707 | 0 | } |
1708 | | |
1709 | | void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) |
1710 | 0 | { |
1711 | 0 | CUData& cu = intraMode.cu; |
1712 | 0 | const Yuv* fencYuv = intraMode.fencYuv; |
1713 | 0 | Yuv* predYuv = &intraMode.predYuv; |
1714 | |
|
1715 | 0 | uint32_t bestMode = 0; |
1716 | 0 | uint64_t bestCost = MAX_INT64; |
1717 | 0 | uint32_t modeList[NUM_CHROMA_MODE]; |
1718 | |
|
1719 | 0 | uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift; |
1720 | 0 | uint32_t tuSize = 1 << log2TrSizeC; |
1721 | 0 | uint32_t tuDepth = 0; |
1722 | 0 | int32_t costShift = 0; |
1723 | |
|
1724 | 0 | if (tuSize > 32) |
1725 | 0 | { |
1726 | 0 | tuDepth = 1; |
1727 | 0 | costShift = 2; |
1728 | 0 | log2TrSizeC = 5; |
1729 | 0 | } |
1730 | |
|
1731 | 0 | IntraNeighbors intraNeighbors; |
1732 | 0 | initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors); |
1733 | 0 | cu.getAllowedChromaDir(0, modeList); |
1734 | | |
1735 | | // check chroma modes |
1736 | 0 | for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++) |
1737 | 0 | { |
1738 | 0 | uint32_t chromaPredMode = modeList[mode]; |
1739 | 0 | if (chromaPredMode == DM_CHROMA_IDX) |
1740 | 0 | chromaPredMode = cu.m_lumaIntraDir[0]; |
1741 | 0 | if (m_csp == X265_CSP_I422) |
1742 | 0 | chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; |
1743 | |
|
1744 | 0 | uint64_t cost = 0; |
1745 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
1746 | 0 | { |
1747 | 0 | const pixel* fenc = fencYuv->m_buf[chromaId]; |
1748 | 0 | pixel* pred = predYuv->m_buf[chromaId]; |
1749 | 0 | Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId); |
1750 | | // get prediction signal |
1751 | 0 | predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC); |
1752 | 0 | cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift; |
1753 | 0 | } |
1754 | |
|
1755 | 0 | if (cost < bestCost) |
1756 | 0 | { |
1757 | 0 | bestCost = cost; |
1758 | 0 | bestMode = modeList[mode]; |
1759 | 0 | } |
1760 | 0 | } |
1761 | |
|
1762 | 0 | cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth); |
1763 | 0 | } |
1764 | | |
1765 | | sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) |
1766 | 0 | { |
1767 | 0 | CUData& cu = intraMode.cu; |
1768 | 0 | Yuv& reconYuv = intraMode.reconYuv; |
1769 | |
|
1770 | 0 | uint32_t depth = cuGeom.depth; |
1771 | 0 | uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444; |
1772 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth; |
1773 | 0 | uint32_t absPartStep = cuGeom.numPartitions; |
1774 | 0 | sse_t totalDistortion = 0; |
1775 | |
|
1776 | 0 | int size = partitionFromLog2Size(log2TrSize); |
1777 | |
|
1778 | 0 | TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); |
1779 | |
|
1780 | 0 | do |
1781 | 0 | { |
1782 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
1783 | |
|
1784 | 0 | uint32_t bestMode = 0; |
1785 | 0 | sse_t bestDist = 0; |
1786 | 0 | uint64_t bestCost = MAX_INT64; |
1787 | | |
1788 | | // init mode list |
1789 | 0 | uint32_t minMode = 0; |
1790 | 0 | uint32_t maxMode = NUM_CHROMA_MODE; |
1791 | 0 | uint32_t modeList[NUM_CHROMA_MODE]; |
1792 | |
|
1793 | 0 | if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth) |
1794 | 0 | { |
1795 | 0 | for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++) |
1796 | 0 | modeList[l] = intraMode.cu.m_chromaIntraDir[0]; |
1797 | 0 | maxMode = 1; |
1798 | 0 | } |
1799 | 0 | else |
1800 | 0 | cu.getAllowedChromaDir(absPartIdxC, modeList); |
1801 | |
|
1802 | 0 | if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) |
1803 | 0 | { |
1804 | 0 | for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++) |
1805 | 0 | modeList[l] = modeList[0]; |
1806 | 0 | maxMode = 1; |
1807 | 0 | } |
1808 | | // check chroma modes |
1809 | 0 | for (uint32_t mode = minMode; mode < maxMode; mode++) |
1810 | 0 | { |
1811 | | // restore context models |
1812 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1813 | |
|
1814 | 0 | cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth); |
1815 | 0 | Cost outCost; |
1816 | 0 | codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost); |
1817 | |
|
1818 | 0 | if (m_slice->m_pps->bTransformSkipEnabled) |
1819 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1820 | |
|
1821 | 0 | m_entropyCoder.resetBits(); |
1822 | | // chroma prediction mode |
1823 | 0 | if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444) |
1824 | 0 | { |
1825 | 0 | if (!absPartIdxC) |
1826 | 0 | m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); |
1827 | 0 | } |
1828 | 0 | else |
1829 | 0 | { |
1830 | 0 | uint32_t qNumParts = cuGeom.numPartitions >> 2; |
1831 | 0 | if (!(absPartIdxC & (qNumParts - 1))) |
1832 | 0 | m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); |
1833 | 0 | } |
1834 | |
|
1835 | 0 | codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC); |
1836 | 0 | codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U); |
1837 | 0 | codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V); |
1838 | 0 | uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); |
1839 | 0 | uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy) : m_rdCost.m_ssimRd ? m_rdCost.calcSsimRdCost(outCost.distortion, bits, outCost.energy) |
1840 | 0 | : m_rdCost.calcRdCost(outCost.distortion, bits); |
1841 | |
|
1842 | 0 | if (cost < bestCost) |
1843 | 0 | { |
1844 | 0 | bestCost = cost; |
1845 | 0 | bestDist = outCost.distortion; |
1846 | 0 | bestMode = modeList[mode]; |
1847 | 0 | extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth); |
1848 | 0 | memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1849 | 0 | memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1850 | 0 | memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1851 | 0 | memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1852 | 0 | } |
1853 | 0 | } |
1854 | |
|
1855 | 0 | if (!tuIterator.isLastSection()) |
1856 | 0 | { |
1857 | 0 | uint32_t zorder = cuGeom.absPartIdx + absPartIdxC; |
1858 | 0 | PicYuv* reconPic = m_frame->m_reconPic[0]; |
1859 | 0 | uint32_t dststride = reconPic->m_strideC; |
1860 | 0 | const pixel* src; |
1861 | 0 | pixel* dst; |
1862 | |
|
1863 | 0 | dst = reconPic->getCbAddr(cu.m_cuAddr, zorder); |
1864 | 0 | src = reconYuv.getCbAddr(absPartIdxC); |
1865 | 0 | primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize); |
1866 | |
|
1867 | 0 | dst = reconPic->getCrAddr(cu.m_cuAddr, zorder); |
1868 | 0 | src = reconYuv.getCrAddr(absPartIdxC); |
1869 | 0 | primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize); |
1870 | 0 | } |
1871 | |
|
1872 | 0 | memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1873 | 0 | memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1874 | 0 | memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1875 | 0 | memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); |
1876 | 0 | cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth); |
1877 | 0 | totalDistortion += bestDist; |
1878 | 0 | } |
1879 | 0 | while (tuIterator.isNextSection()); |
1880 | |
|
1881 | 0 | if (initTuDepth != 0) |
1882 | 0 | { |
1883 | 0 | uint32_t combCbfU = 0; |
1884 | 0 | uint32_t combCbfV = 0; |
1885 | 0 | uint32_t qNumParts = tuIterator.absPartIdxStep; |
1886 | 0 | for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
1887 | 0 | { |
1888 | 0 | combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1); |
1889 | 0 | combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1); |
1890 | 0 | } |
1891 | |
|
1892 | 0 | cu.m_cbf[1][0] |= combCbfU; |
1893 | 0 | cu.m_cbf[2][0] |= combCbfV; |
1894 | 0 | } |
1895 | | |
1896 | | /* TODO: remove this */ |
1897 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
1898 | 0 | return totalDistortion; |
1899 | 0 | } |
1900 | | |
1901 | | /* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */ |
1902 | | uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m) |
1903 | 0 | { |
1904 | 0 | X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n"); |
1905 | |
|
1906 | 0 | MVField candMvField[MRG_MAX_NUM_CANDS][2]; |
1907 | 0 | uint8_t candDir[MRG_MAX_NUM_CANDS]; |
1908 | 0 | uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir); |
1909 | | #if ENABLE_SCC_EXT |
1910 | | restrictBipredMergeCand(&cu, 0, candMvField, candDir, numMergeCand); |
1911 | | #else |
1912 | 0 | if (cu.isBipredRestriction()) |
1913 | 0 | { |
1914 | | /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */ |
1915 | 0 | for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) |
1916 | 0 | { |
1917 | 0 | if (candDir[mergeCand] == 3) |
1918 | 0 | { |
1919 | 0 | candDir[mergeCand] = 1; |
1920 | 0 | candMvField[mergeCand][1].refIdx = REF_NOT_VALID; |
1921 | 0 | } |
1922 | 0 | } |
1923 | 0 | } |
1924 | 0 | #endif |
1925 | |
|
1926 | 0 | Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv; |
1927 | |
|
1928 | 0 | uint32_t outCost = MAX_UINT; |
1929 | 0 | for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) |
1930 | 0 | { |
1931 | | /* Prevent TMVP candidates from using unavailable reference pixels */ |
1932 | 0 | if (m_bFrameParallel) |
1933 | 0 | { |
1934 | | // Parallel slices bound check |
1935 | 0 | if (m_param->maxSlices > 1) |
1936 | 0 | { |
1937 | 0 | if (cu.m_bFirstRowInSlice & |
1938 | 0 | ((candMvField[mergeCand][0].mv.y < (2 * 4)) | (candMvField[mergeCand][1].mv.y < (2 * 4)))) |
1939 | 0 | continue; |
1940 | | |
1941 | | // Last row in slice can't reference beyond bound since it is another slice area |
1942 | | // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance |
1943 | 0 | if (cu.m_bLastRowInSlice && |
1944 | 0 | ((candMvField[mergeCand][0].mv.y > -3 * 4) | (candMvField[mergeCand][1].mv.y > -3 * 4))) |
1945 | 0 | continue; |
1946 | 0 | } |
1947 | | |
1948 | 0 | if (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 || |
1949 | 0 | candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4) |
1950 | 0 | continue; |
1951 | 0 | } |
1952 | | |
1953 | | #if ENABLE_SCC_EXT |
1954 | | if ((candDir[mergeCand] == 1 || candDir[mergeCand] == 3) && (m_slice->m_refPOCList[0][candMvField[mergeCand][0].refIdx] == m_slice->m_poc)) |
1955 | | { |
1956 | | continue; |
1957 | | } |
1958 | | #endif |
1959 | 0 | cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv; |
1960 | 0 | cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx; |
1961 | 0 | cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv; |
1962 | 0 | cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx; |
1963 | |
|
1964 | 0 | motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD); |
1965 | |
|
1966 | 0 | uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size); |
1967 | 0 | if (m_me.bChromaSATD) |
1968 | 0 | costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx); |
1969 | |
|
1970 | 0 | uint32_t bitsCand = getTUBits(mergeCand, numMergeCand); |
1971 | 0 | costCand = costCand + m_rdCost.getCost(bitsCand); |
1972 | 0 | if (costCand < outCost) |
1973 | 0 | { |
1974 | 0 | outCost = costCand; |
1975 | 0 | m.bits = bitsCand; |
1976 | 0 | m.index = mergeCand; |
1977 | 0 | } |
1978 | 0 | } |
1979 | |
|
1980 | 0 | m.mvField[0] = candMvField[m.index][0]; |
1981 | 0 | m.mvField[1] = candMvField[m.index][1]; |
1982 | 0 | m.dir = candDir[m.index]; |
1983 | |
|
1984 | 0 | return outCost; |
1985 | 0 | } |
1986 | | |
1987 | | /* find the lowres motion vector from lookahead in middle of current PU */ |
1988 | | MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref) |
1989 | 0 | { |
1990 | 0 | int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]); |
1991 | 0 | if (diffPoc > m_param->bframes + 1) |
1992 | | /* poc difference is out of range for lookahead */ |
1993 | 0 | return 0; |
1994 | | |
1995 | 0 | MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc]; |
1996 | 0 | if (mvs[0].x == 0x7FFF) |
1997 | | /* this motion search was not estimated by lookahead */ |
1998 | 0 | return 0; |
1999 | | |
2000 | 0 | uint32_t block_x = (cu.m_cuPelX + g_zscanToPelX[pu.puAbsPartIdx] + pu.width / 2) >> 4; |
2001 | 0 | uint32_t block_y = (cu.m_cuPelY + g_zscanToPelY[pu.puAbsPartIdx] + pu.height / 2) >> 4; |
2002 | 0 | uint32_t idx = block_y * m_frame->m_lowres.maxBlocksInRow + block_x; |
2003 | |
|
2004 | 0 | X265_CHECK(block_x < m_frame->m_lowres.maxBlocksInRow, "block_x is too high\n"); |
2005 | 0 | X265_CHECK(block_y < m_frame->m_lowres.maxBlocksInCol, "block_y is too high\n"); |
2006 | |
|
2007 | 0 | return mvs[idx] << 1; /* scale up lowres mv */ |
2008 | 0 | } |
2009 | | |
2010 | | /* Pick between the two AMVP candidates which is the best one to use as |
2011 | | * MVP for the motion search, based on SAD cost */ |
2012 | | int Search::selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref) |
2013 | 0 | { |
2014 | 0 | if (amvp[0] == amvp[1]) |
2015 | 0 | return 0; |
2016 | | |
2017 | 0 | Yuv& tmpPredYuv = m_rqt[cu.m_cuDepth[0]].tmpPredYuv; |
2018 | 0 | uint32_t costs[AMVP_NUM_CANDS]; |
2019 | |
|
2020 | 0 | for (int i = 0; i < AMVP_NUM_CANDS; i++) |
2021 | 0 | { |
2022 | 0 | MV mvCand = amvp[i]; |
2023 | | |
2024 | | // NOTE: skip mvCand if Y is > merange and -FN>1 |
2025 | 0 | if (m_bFrameParallel) |
2026 | 0 | { |
2027 | 0 | costs[i] = m_me.COST_MAX; |
2028 | |
|
2029 | 0 | if (mvCand.y >= (m_param->searchRange + 1) * 4) |
2030 | 0 | continue; |
2031 | | |
2032 | 0 | if ((m_param->maxSlices > 1) & |
2033 | 0 | ((mvCand.y < m_sliceMinY) |
2034 | 0 | | (mvCand.y > m_sliceMaxY))) |
2035 | 0 | continue; |
2036 | 0 | } |
2037 | 0 | cu.clipMv(mvCand); |
2038 | | #if ENABLE_SCC_EXT |
2039 | | if (m_slice->m_param->bEnableSCC && !list && ref == m_slice->m_numRefIdx[0] - 1) |
2040 | | predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refFrameList[list][ref]->m_reconPic[1], mvCand); |
2041 | | else |
2042 | | #endif |
2043 | 0 | predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand); |
2044 | 0 | costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); |
2045 | 0 | } |
2046 | |
|
2047 | 0 | return (costs[0] <= costs[1]) ? 0 : 1; |
2048 | 0 | } |
2049 | | |
2050 | | void Search::PME::processTasks(int workerThreadId) |
2051 | 0 | { |
2052 | | #if DETAILED_CU_STATS |
2053 | | int fe = mode.cu.m_encData->m_frameEncoderID; |
2054 | | master.m_stats[fe].countPMETasks++; |
2055 | | ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime); |
2056 | | #endif |
2057 | 0 | ProfileScopeEvent(pme); |
2058 | 0 | master.processPME(*this, master.m_tld[workerThreadId].analysis); |
2059 | 0 | } |
2060 | | |
2061 | | void Search::processPME(PME& pme, Search& slave) |
2062 | 0 | { |
2063 | | /* acquire a motion estimation job, else exit early */ |
2064 | 0 | int meId; |
2065 | 0 | pme.m_lock.acquire(); |
2066 | 0 | if (pme.m_jobTotal > pme.m_jobAcquired) |
2067 | 0 | { |
2068 | 0 | meId = pme.m_jobAcquired++; |
2069 | 0 | pme.m_lock.release(); |
2070 | 0 | } |
2071 | 0 | else |
2072 | 0 | { |
2073 | 0 | pme.m_lock.release(); |
2074 | 0 | return; |
2075 | 0 | } |
2076 | | |
2077 | | /* Setup slave Search instance for ME for master's CU */ |
2078 | 0 | if (&slave != this) |
2079 | 0 | { |
2080 | 0 | slave.m_slice = m_slice; |
2081 | 0 | slave.m_frame = m_frame; |
2082 | 0 | slave.m_param = m_param; |
2083 | 0 | slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp); |
2084 | 0 | bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400; |
2085 | 0 | slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma); |
2086 | 0 | } |
2087 | | |
2088 | | /* Perform ME, repeat until no more work is available */ |
2089 | 0 | do |
2090 | 0 | { |
2091 | 0 | if (meId < pme.m_jobs.refCnt[0]) |
2092 | 0 | { |
2093 | 0 | int refIdx = pme.m_jobs.ref[0][meId]; //L0 |
2094 | 0 | slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 0, refIdx); |
2095 | 0 | } |
2096 | 0 | else |
2097 | 0 | { |
2098 | 0 | int refIdx = pme.m_jobs.ref[1][meId - pme.m_jobs.refCnt[0]]; //L1 |
2099 | 0 | slave.singleMotionEstimation(*this, pme.mode, pme.pu, pme.puIdx, 1, refIdx); |
2100 | 0 | } |
2101 | |
|
2102 | 0 | meId = -1; |
2103 | 0 | pme.m_lock.acquire(); |
2104 | 0 | if (pme.m_jobTotal > pme.m_jobAcquired) |
2105 | 0 | meId = pme.m_jobAcquired++; |
2106 | 0 | pme.m_lock.release(); |
2107 | 0 | } |
2108 | 0 | while (meId >= 0); |
2109 | 0 | } |
2110 | | |
2111 | | void Search::singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref) |
2112 | 0 | { |
2113 | 0 | uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS; |
2114 | 0 | int numIdx = m_slice->m_numRefIdx[list]; |
2115 | | #if ENABLE_SCC_EXT |
2116 | | if (!list && m_ibcEnabled) |
2117 | | numIdx--; |
2118 | | #endif |
2119 | 0 | bits += getTUBits(ref, numIdx); |
2120 | |
|
2121 | 0 | MotionData* bestME = interMode.bestME[part]; |
2122 | | |
2123 | | // 12 mv candidates including lowresMV |
2124 | 0 | MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; |
2125 | | #if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT) |
2126 | | int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, 0, pu.puAbsPartIdx); |
2127 | | #else |
2128 | 0 | int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); |
2129 | 0 | #endif |
2130 | |
|
2131 | 0 | const MV* amvp = interMode.amvpCand[list][ref]; |
2132 | 0 | int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref); |
2133 | 0 | bool bLowresMVP = false; |
2134 | 0 | MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres; |
2135 | |
|
2136 | 0 | if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging if lowresMV is not available */ |
2137 | 0 | { |
2138 | 0 | MV lmv = getLowresMV(interMode.cu, pu, list, ref); |
2139 | 0 | int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0; |
2140 | 0 | if (lmv.notZero() && !layer) |
2141 | 0 | mvc[numMvc++] = lmv; |
2142 | 0 | if (m_param->bEnableHME) |
2143 | 0 | mvp_lowres = lmv; |
2144 | 0 | } |
2145 | |
|
2146 | 0 | m_vertRestriction = interMode.cu.m_slice->m_refPOCList[list][ref] == interMode.cu.m_slice->m_poc; |
2147 | 0 | setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax); |
2148 | |
|
2149 | 0 | int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction, |
2150 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2151 | |
|
2152 | 0 | if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp) |
2153 | 0 | { |
2154 | 0 | MV outmv_lowres; |
2155 | 0 | setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax); |
2156 | 0 | int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction, |
2157 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2158 | 0 | if (lowresMvCost < satdCost) |
2159 | 0 | { |
2160 | 0 | outmv = outmv_lowres; |
2161 | 0 | satdCost = lowresMvCost; |
2162 | 0 | bLowresMVP = true; |
2163 | 0 | } |
2164 | 0 | } |
2165 | | /* Get total cost of partition, but only include MV bit cost once */ |
2166 | 0 | bits += m_me.bitcost(outmv); |
2167 | 0 | uint32_t mvCost = m_me.mvcost(outmv); |
2168 | 0 | uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits); |
2169 | | |
2170 | | /* Update LowresMVP to best AMVP cand*/ |
2171 | 0 | if (bLowresMVP) |
2172 | 0 | updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres); |
2173 | | |
2174 | | /* Refine MVP selection, updates: mvpIdx, bits, cost */ |
2175 | 0 | mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost); |
2176 | | |
2177 | | /* tie goes to the smallest ref ID, just like --no-pme */ |
2178 | 0 | ScopedLock _lock(master.m_meLock); |
2179 | 0 | if (cost < bestME[list].cost || |
2180 | 0 | (cost == bestME[list].cost && ref < bestME[list].ref)) |
2181 | 0 | { |
2182 | 0 | bestME[list].mv = outmv; |
2183 | 0 | bestME[list].mvp = mvp; |
2184 | 0 | bestME[list].mvpIdx = mvpIdx; |
2185 | 0 | bestME[list].ref = ref; |
2186 | 0 | bestME[list].cost = cost; |
2187 | 0 | bestME[list].bits = bits; |
2188 | 0 | bestME[list].mvCost = mvCost; |
2189 | 0 | } |
2190 | 0 | } |
2191 | | void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc) |
2192 | 0 | { |
2193 | 0 | CUData& cu = interMode.cu; |
2194 | 0 | MV mv, mvmin, mvmax; |
2195 | 0 | int cand = 0, bestcost = INT_MAX; |
2196 | 0 | while (cand < m_param->mvRefine) |
2197 | 0 | { |
2198 | 0 | if ((cand && mvp[cand] == mvp[cand - 1]) || (cand == 2 && (mvp[cand] == mvp[cand - 2] || mvp[cand] == mvp[cand - 1]))) |
2199 | 0 | { |
2200 | 0 | cand++; |
2201 | 0 | continue; |
2202 | 0 | } |
2203 | 0 | MV bestMV; |
2204 | 0 | mv = mvp[cand++]; |
2205 | 0 | cu.clipMv(mv); |
2206 | 0 | m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc; |
2207 | 0 | setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax); |
2208 | 0 | int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices, m_vertRestriction, |
2209 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2210 | 0 | if (bestcost > cost) |
2211 | 0 | { |
2212 | 0 | bestcost = cost; |
2213 | 0 | outmv = bestMV; |
2214 | 0 | } |
2215 | 0 | } |
2216 | 0 | } |
2217 | | /* find the best inter prediction for each PU of specified mode */ |
2218 | | #if ENABLE_SCC_EXT |
2219 | | void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2], MV* iMVCandList) |
2220 | | #else |
2221 | | void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2]) |
2222 | | #endif |
2223 | 0 | { |
2224 | 0 | ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate); |
2225 | |
|
2226 | 0 | CUData& cu = interMode.cu; |
2227 | 0 | Yuv* predYuv = &interMode.predYuv; |
2228 | | |
2229 | | // 12 mv candidates including lowresMV |
2230 | 0 | MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; |
2231 | |
|
2232 | 0 | const Slice *slice = m_slice; |
2233 | 0 | int numPart = cu.getNumPartInter(0); |
2234 | 0 | int numPredDir = slice->isInterP() ? 1 : 2; |
2235 | 0 | const int* numRefIdx = slice->m_numRefIdx; |
2236 | 0 | uint32_t lastMode = 0; |
2237 | 0 | int totalmebits = 0; |
2238 | 0 | MV mvzero(0, 0); |
2239 | 0 | Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; |
2240 | 0 | MergeData merge; |
2241 | 0 | memset(&merge, 0, sizeof(merge)); |
2242 | 0 | bool useAsMVP = false; |
2243 | 0 | for (int puIdx = 0; puIdx < numPart; puIdx++) |
2244 | 0 | { |
2245 | 0 | MotionData* bestME = interMode.bestME[puIdx]; |
2246 | 0 | PredictionUnit pu(cu, cuGeom, puIdx); |
2247 | 0 | m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC); |
2248 | 0 | useAsMVP = false; |
2249 | 0 | x265_analysis_inter_data* interDataCTU = NULL; |
2250 | 0 | int cuIdx; |
2251 | 0 | cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx; |
2252 | 0 | if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1) |
2253 | 0 | { |
2254 | 0 | interDataCTU = m_frame->m_analysisData.interData; |
2255 | 0 | if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx]) |
2256 | 0 | && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx]) |
2257 | 0 | && !(interDataCTU->mergeFlag[cuIdx + puIdx]) |
2258 | 0 | && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx])) |
2259 | 0 | useAsMVP = true; |
2260 | 0 | } |
2261 | | /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */ |
2262 | 0 | uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge); |
2263 | 0 | bestME[0].cost = MAX_UINT; |
2264 | 0 | bestME[1].cost = MAX_UINT; |
2265 | |
|
2266 | 0 | getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits); |
2267 | 0 | bool bDoUnidir = true; |
2268 | |
|
2269 | 0 | cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours); |
2270 | | /* Uni-directional prediction */ |
2271 | 0 | if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10) |
2272 | 0 | || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP)) |
2273 | 0 | { |
2274 | 0 | for (int list = 0; list < numPredDir; list++) |
2275 | 0 | { |
2276 | |
|
2277 | 0 | int ref = -1; |
2278 | 0 | if (useAsMVP) |
2279 | 0 | ref = interDataCTU->refIdx[list][cuIdx + puIdx]; |
2280 | 0 | else |
2281 | 0 | ref = bestME[list].ref; |
2282 | 0 | if (ref < 0) |
2283 | 0 | { |
2284 | 0 | continue; |
2285 | 0 | } |
2286 | 0 | uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS; |
2287 | 0 | int numIdx = m_slice->m_numRefIdx[list]; |
2288 | | #if ENABLE_SCC_EXT |
2289 | | if (!list && m_ibcEnabled) |
2290 | | numIdx--; |
2291 | | #endif |
2292 | 0 | bits += getTUBits(ref, numIdx); |
2293 | |
|
2294 | | #if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT) |
2295 | | int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx); |
2296 | | #else |
2297 | 0 | int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); |
2298 | 0 | #endif |
2299 | 0 | const MV* amvp = interMode.amvpCand[list][ref]; |
2300 | 0 | int mvpIdx = selectMVP(cu, pu, amvp, list, ref); |
2301 | 0 | MV mvmin, mvmax, outmv, mvp; |
2302 | 0 | if (useAsMVP) |
2303 | 0 | { |
2304 | 0 | mvp = interDataCTU->mv[list][cuIdx + puIdx].word; |
2305 | 0 | mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx]; |
2306 | 0 | } |
2307 | 0 | else |
2308 | 0 | mvp = amvp[mvpIdx]; |
2309 | 0 | if (m_param->searchMethod == X265_SEA) |
2310 | 0 | { |
2311 | 0 | int puX = puIdx & 1; |
2312 | 0 | int puY = puIdx >> 1; |
2313 | 0 | for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++) |
2314 | 0 | m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride; |
2315 | 0 | } |
2316 | 0 | setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax); |
2317 | 0 | MV mvpIn = mvp; |
2318 | 0 | int satdCost; |
2319 | 0 | if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx) |
2320 | 0 | mvpIn = bestME[list].mv; |
2321 | 0 | if (useAsMVP && m_param->mvRefine > 1) |
2322 | 0 | { |
2323 | 0 | MV bestmv, mvpSel[3]; |
2324 | 0 | int mvpIdxSel[3]; |
2325 | 0 | satdCost = m_me.COST_MAX; |
2326 | 0 | mvpSel[0] = mvp; |
2327 | 0 | mvpIdxSel[0] = mvpIdx; |
2328 | 0 | mvpIdx = selectMVP(cu, pu, amvp, list, ref); |
2329 | 0 | mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx]; |
2330 | 0 | mvpIdxSel[1] = mvpIdx; |
2331 | 0 | if (m_param->mvRefine > 2) |
2332 | 0 | { |
2333 | 0 | mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx]; |
2334 | 0 | mvpIdxSel[2] = !mvpIdx; |
2335 | 0 | } |
2336 | 0 | for (int cand = 0; cand < m_param->mvRefine; cand++) |
2337 | 0 | { |
2338 | 0 | if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2]))) |
2339 | 0 | continue; |
2340 | 0 | setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax); |
2341 | 0 | int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices, m_vertRestriction, |
2342 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2343 | 0 | if (satdCost > bcost) |
2344 | 0 | { |
2345 | 0 | satdCost = bcost; |
2346 | 0 | outmv = bestmv; |
2347 | 0 | mvp = mvpSel[cand]; |
2348 | 0 | mvpIdx = mvpIdxSel[cand]; |
2349 | 0 | } |
2350 | 0 | } |
2351 | 0 | mvpIn = mvp; |
2352 | 0 | } |
2353 | 0 | else |
2354 | 0 | { |
2355 | 0 | satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction, |
2356 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2357 | 0 | } |
2358 | | |
2359 | | /* Get total cost of partition, but only include MV bit cost once */ |
2360 | 0 | bits += m_me.bitcost(outmv); |
2361 | 0 | uint32_t mvCost = m_me.mvcost(outmv); |
2362 | 0 | uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits); |
2363 | | /* Refine MVP selection, updates: mvpIdx, bits, cost */ |
2364 | 0 | if (!(m_param->analysisMultiPassRefine || useAsMVP)) |
2365 | 0 | mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost); |
2366 | 0 | else |
2367 | 0 | { |
2368 | | /* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here |
2369 | | the actual mvp is bestME from pass 1 for that mvpIdx */ |
2370 | 0 | int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn); |
2371 | 0 | if (diffBits < 0) |
2372 | 0 | { |
2373 | 0 | mvpIdx = !mvpIdx; |
2374 | 0 | uint32_t origOutBits = bits; |
2375 | 0 | bits = origOutBits + diffBits; |
2376 | 0 | cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits); |
2377 | 0 | } |
2378 | 0 | mvp = amvp[mvpIdx]; |
2379 | 0 | } |
2380 | |
|
2381 | 0 | if (cost < bestME[list].cost) |
2382 | 0 | { |
2383 | 0 | bestME[list].mv = outmv; |
2384 | 0 | bestME[list].mvp = mvp; |
2385 | 0 | bestME[list].mvpIdx = mvpIdx; |
2386 | 0 | bestME[list].cost = cost; |
2387 | 0 | bestME[list].bits = bits; |
2388 | 0 | bestME[list].mvCost = mvCost; |
2389 | 0 | bestME[list].ref = ref; |
2390 | 0 | } |
2391 | 0 | bDoUnidir = false; |
2392 | 0 | } |
2393 | 0 | } |
2394 | 0 | else if (m_param->bDistributeMotionEstimation) |
2395 | 0 | { |
2396 | 0 | PME pme(*this, interMode, cuGeom, pu, puIdx); |
2397 | 0 | pme.m_jobTotal = 0; |
2398 | 0 | pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */ |
2399 | |
|
2400 | 0 | uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1; |
2401 | 0 | for (int list = 0; list < numPredDir; list++) |
2402 | 0 | { |
2403 | 0 | int idx = 0; |
2404 | 0 | int numIdx = numRefIdx[list]; |
2405 | | #if ENABLE_SCC_EXT |
2406 | | if (!list && m_ibcEnabled) |
2407 | | numIdx--; |
2408 | | #endif |
2409 | 0 | for (int ref = 0; ref < numIdx; ref++) |
2410 | 0 | { |
2411 | 0 | if (!(refMask & (1 << ref))) |
2412 | 0 | continue; |
2413 | | |
2414 | 0 | pme.m_jobs.ref[list][idx++] = ref; |
2415 | 0 | pme.m_jobTotal++; |
2416 | 0 | } |
2417 | 0 | pme.m_jobs.refCnt[list] = idx; |
2418 | | |
2419 | | /* the second list ref bits start at bit 16 */ |
2420 | 0 | refMask >>= 16; |
2421 | 0 | } |
2422 | |
|
2423 | 0 | if (pme.m_jobTotal > 2) |
2424 | 0 | { |
2425 | 0 | pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1); |
2426 | |
|
2427 | 0 | processPME(pme, *this); |
2428 | |
|
2429 | 0 | int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0]; |
2430 | 0 | singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */ |
2431 | |
|
2432 | 0 | bDoUnidir = false; |
2433 | |
|
2434 | 0 | ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters); |
2435 | 0 | pme.waitForExit(); |
2436 | 0 | } |
2437 | | |
2438 | | /* if no peer threads were bonded, fall back to doing unidirectional |
2439 | | * searches ourselves without overhead of singleMotionEstimation() */ |
2440 | 0 | } |
2441 | 0 | if (bDoUnidir) |
2442 | 0 | { |
2443 | 0 | interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1; |
2444 | 0 | uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1; |
2445 | |
|
2446 | 0 | for (int list = 0; list < numPredDir; list++) |
2447 | 0 | { |
2448 | 0 | int numIdx = numRefIdx[list]; |
2449 | | #if ENABLE_SCC_EXT |
2450 | | if (!list && m_ibcEnabled) |
2451 | | numIdx--; |
2452 | | #endif |
2453 | 0 | for (int ref = 0; ref < numIdx; ref++) |
2454 | 0 | { |
2455 | 0 | ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]); |
2456 | |
|
2457 | 0 | if (!(refMask & (1 << ref))) |
2458 | 0 | { |
2459 | 0 | ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]); |
2460 | 0 | continue; |
2461 | 0 | } |
2462 | | |
2463 | 0 | uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS; |
2464 | 0 | bits += getTUBits(ref, numIdx); |
2465 | |
|
2466 | | #if (ENABLE_MULTIVIEW || ENABLE_SCC_EXT) |
2467 | | int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc, puIdx, pu.puAbsPartIdx); |
2468 | | #else |
2469 | 0 | int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); |
2470 | 0 | #endif |
2471 | |
|
2472 | 0 | const MV* amvp = interMode.amvpCand[list][ref]; |
2473 | 0 | int mvpIdx = selectMVP(cu, pu, amvp, list, ref); |
2474 | 0 | MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres; |
2475 | 0 | bool bLowresMVP = false; |
2476 | |
|
2477 | 0 | if (!strlen(m_param->analysisSave) && !strlen(m_param->analysisLoad)) /* Prevents load/save outputs from diverging when lowresMV is not available */ |
2478 | 0 | { |
2479 | 0 | MV lmv = getLowresMV(cu, pu, list, ref); |
2480 | 0 | int layer = m_param->numViews > 1 ? m_frame->m_viewId : (m_param->numScalableLayers > 1) ? m_frame->m_sLayerId : 0; |
2481 | 0 | if (lmv.notZero() && !layer) |
2482 | 0 | mvc[numMvc++] = lmv; |
2483 | 0 | if (m_param->bEnableHME) |
2484 | 0 | mvp_lowres = lmv; |
2485 | 0 | } |
2486 | 0 | if (m_param->searchMethod == X265_SEA) |
2487 | 0 | { |
2488 | 0 | int puX = puIdx & 1; |
2489 | 0 | int puY = puIdx >> 1; |
2490 | 0 | for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++) |
2491 | 0 | m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic[0]->m_stride; |
2492 | 0 | } |
2493 | 0 | m_vertRestriction = cu.m_slice->m_refPOCList[list][ref] == cu.m_slice->m_poc; |
2494 | 0 | setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax); |
2495 | 0 | int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_vertRestriction, |
2496 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2497 | |
|
2498 | 0 | if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp) |
2499 | 0 | { |
2500 | 0 | MV outmv_lowres; |
2501 | 0 | setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax); |
2502 | 0 | int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices, m_vertRestriction, |
2503 | 0 | m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); |
2504 | 0 | if (lowresMvCost < satdCost) |
2505 | 0 | { |
2506 | 0 | outmv = outmv_lowres; |
2507 | 0 | satdCost = lowresMvCost; |
2508 | 0 | bLowresMVP = true; |
2509 | 0 | } |
2510 | 0 | } |
2511 | | |
2512 | | /* Get total cost of partition, but only include MV bit cost once */ |
2513 | 0 | bits += m_me.bitcost(outmv); |
2514 | 0 | uint32_t mvCost = m_me.mvcost(outmv); |
2515 | 0 | uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits); |
2516 | | /* Update LowresMVP to best AMVP cand*/ |
2517 | 0 | if (bLowresMVP) |
2518 | 0 | updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres); |
2519 | | |
2520 | | /* Refine MVP selection, updates: mvpIdx, bits, cost */ |
2521 | 0 | mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost); |
2522 | |
|
2523 | | #if ENABLE_SCC_EXT |
2524 | | if (m_param->bEnableSCC && (list <= 1 && ref <= 1 && (cu.m_partSize[0] == SIZE_2NxN || cu.m_partSize[0] == SIZE_Nx2N) && (1 << cu.m_log2CUSize[0]) <= 16)) |
2525 | | { |
2526 | | iMVCandList[4 * list + 2 * ref + puIdx] = outmv; |
2527 | | } |
2528 | | #endif |
2529 | |
|
2530 | 0 | if (cost < bestME[list].cost) |
2531 | 0 | { |
2532 | 0 | bestME[list].mv = outmv; |
2533 | 0 | bestME[list].mvp = mvp; |
2534 | 0 | bestME[list].mvpIdx = mvpIdx; |
2535 | 0 | bestME[list].ref = ref; |
2536 | 0 | bestME[list].cost = cost; |
2537 | 0 | bestME[list].bits = bits; |
2538 | 0 | bestME[list].mvCost = mvCost; |
2539 | 0 | } |
2540 | 0 | } |
2541 | | /* the second list ref bits start at bit 16 */ |
2542 | 0 | refMask >>= 16; |
2543 | 0 | } |
2544 | 0 | } |
2545 | | |
2546 | | /* Bi-directional prediction */ |
2547 | 0 | MotionData bidir[2]; |
2548 | 0 | uint32_t bidirCost = MAX_UINT; |
2549 | 0 | int bidirBits = 0; |
2550 | |
|
2551 | 0 | if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */ |
2552 | 0 | cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ |
2553 | 0 | bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT) |
2554 | 0 | { |
2555 | 0 | bidir[0] = bestME[0]; |
2556 | 0 | bidir[1] = bestME[1]; |
2557 | |
|
2558 | 0 | int satdCost; |
2559 | |
|
2560 | 0 | if (m_me.bChromaSATD) |
2561 | 0 | { |
2562 | 0 | cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv; |
2563 | 0 | cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref; |
2564 | 0 | cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv; |
2565 | 0 | cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref; |
2566 | 0 | motionCompensation(cu, pu, tmpPredYuv, true, true); |
2567 | |
|
2568 | 0 | satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) + |
2569 | 0 | m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx); |
2570 | 0 | } |
2571 | 0 | else |
2572 | 0 | { |
2573 | 0 | PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref]; |
2574 | 0 | PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref]; |
2575 | 0 | Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; |
2576 | | |
2577 | | /* Generate reference subpels */ |
2578 | 0 | predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv); |
2579 | 0 | predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv); |
2580 | 0 | primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size, |
2581 | 0 | bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32); |
2582 | 0 | satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); |
2583 | 0 | } |
2584 | |
|
2585 | 0 | bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); |
2586 | 0 | bidirCost = satdCost + m_rdCost.getCost(bidirBits); |
2587 | |
|
2588 | 0 | bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero(); |
2589 | 0 | if (bTryZero) |
2590 | 0 | { |
2591 | | /* Do not try zero MV if unidir motion predictors are beyond |
2592 | | * valid search area */ |
2593 | 0 | MV mvmin, mvmax; |
2594 | 0 | int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight); |
2595 | 0 | setSearchRange(cu, mvzero, merange, mvmin, mvmax); |
2596 | 0 | mvmax.y += 2; // there is some pad for subpel refine |
2597 | 0 | mvmin <<= 2; |
2598 | 0 | mvmax <<= 2; |
2599 | |
|
2600 | 0 | bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax); |
2601 | 0 | bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax); |
2602 | 0 | } |
2603 | 0 | if (bTryZero) |
2604 | 0 | { |
2605 | | /* coincident blocks of the two reference pictures */ |
2606 | 0 | if (m_me.bChromaSATD) |
2607 | 0 | { |
2608 | 0 | cu.m_mv[0][pu.puAbsPartIdx] = mvzero; |
2609 | 0 | cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref; |
2610 | 0 | cu.m_mv[1][pu.puAbsPartIdx] = mvzero; |
2611 | 0 | cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref; |
2612 | 0 | motionCompensation(cu, pu, tmpPredYuv, true, true); |
2613 | |
|
2614 | 0 | satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) + |
2615 | 0 | m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx); |
2616 | 0 | } |
2617 | 0 | else |
2618 | 0 | { |
2619 | 0 | const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx); |
2620 | 0 | const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx); |
2621 | 0 | intptr_t refStride = slice->m_mref[0][0].lumaStride; |
2622 | 0 | primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); |
2623 | 0 | satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); |
2624 | 0 | } |
2625 | 0 | MV mvp0 = bestME[0].mvp; |
2626 | 0 | int mvpIdx0 = bestME[0].mvpIdx; |
2627 | 0 | uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); |
2628 | |
|
2629 | 0 | MV mvp1 = bestME[1].mvp; |
2630 | 0 | int mvpIdx1 = bestME[1].mvpIdx; |
2631 | 0 | uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); |
2632 | |
|
2633 | 0 | uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); |
2634 | | |
2635 | | /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */ |
2636 | 0 | mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost); |
2637 | 0 | mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost); |
2638 | |
|
2639 | 0 | if (cost < bidirCost) |
2640 | 0 | { |
2641 | 0 | bidir[0].mv = mvzero; |
2642 | 0 | bidir[1].mv = mvzero; |
2643 | 0 | bidir[0].mvp = mvp0; |
2644 | 0 | bidir[1].mvp = mvp1; |
2645 | 0 | bidir[0].mvpIdx = mvpIdx0; |
2646 | 0 | bidir[1].mvpIdx = mvpIdx1; |
2647 | 0 | bidirCost = cost; |
2648 | 0 | bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); |
2649 | 0 | } |
2650 | 0 | } |
2651 | 0 | } |
2652 | | |
2653 | | /* select best option and store into CU */ |
2654 | 0 | if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) |
2655 | 0 | { |
2656 | 0 | cu.m_mergeFlag[pu.puAbsPartIdx] = true; |
2657 | 0 | cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */ |
2658 | 0 | cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx); |
2659 | 0 | cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx); |
2660 | 0 | cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx); |
2661 | 0 | cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx); |
2662 | 0 | cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx); |
2663 | |
|
2664 | 0 | totalmebits += merge.bits; |
2665 | 0 | } |
2666 | 0 | else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) |
2667 | 0 | { |
2668 | 0 | lastMode = 2; |
2669 | |
|
2670 | 0 | cu.m_mergeFlag[pu.puAbsPartIdx] = false; |
2671 | 0 | cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx); |
2672 | 0 | cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx); |
2673 | 0 | cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx); |
2674 | 0 | cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; |
2675 | 0 | cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx; |
2676 | |
|
2677 | 0 | cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx); |
2678 | 0 | cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx); |
2679 | 0 | cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; |
2680 | 0 | cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx; |
2681 | |
|
2682 | 0 | totalmebits += bidirBits; |
2683 | 0 | } |
2684 | 0 | else if (bestME[0].cost <= bestME[1].cost) |
2685 | 0 | { |
2686 | 0 | lastMode = 0; |
2687 | |
|
2688 | 0 | cu.m_mergeFlag[pu.puAbsPartIdx] = false; |
2689 | 0 | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); |
2690 | 0 | cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx); |
2691 | 0 | cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx); |
2692 | 0 | cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp; |
2693 | 0 | cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx; |
2694 | |
|
2695 | 0 | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); |
2696 | 0 | cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx); |
2697 | |
|
2698 | 0 | totalmebits += bestME[0].bits; |
2699 | 0 | } |
2700 | 0 | else |
2701 | 0 | { |
2702 | 0 | lastMode = 1; |
2703 | |
|
2704 | 0 | cu.m_mergeFlag[pu.puAbsPartIdx] = false; |
2705 | 0 | cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx); |
2706 | 0 | cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx); |
2707 | 0 | cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx); |
2708 | 0 | cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp; |
2709 | 0 | cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx; |
2710 | |
|
2711 | 0 | cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); |
2712 | 0 | cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx); |
2713 | |
|
2714 | 0 | totalmebits += bestME[1].bits; |
2715 | 0 | } |
2716 | |
|
2717 | 0 | motionCompensation(cu, pu, *predYuv, true, bChromaMC); |
2718 | 0 | } |
2719 | 0 | interMode.sa8dBits += totalmebits; |
2720 | 0 | } |
2721 | | |
2722 | | #if ENABLE_SCC_EXT |
2723 | | uint32_t Search::getSAD(pixel* ref, int refStride, const pixel* curr, int currStride, int width, int height) |
2724 | | { |
2725 | | uint32_t dist = 0; |
2726 | | |
2727 | | for (int i = 0; i < height; i++) |
2728 | | { |
2729 | | for (int j = 0; j < width; j++) |
2730 | | { |
2731 | | dist += abs(ref[j] - curr[j]); |
2732 | | } |
2733 | | ref += refStride; |
2734 | | curr += currStride; |
2735 | | } |
2736 | | return dist; |
2737 | | } |
2738 | | |
2739 | | int Search::intraBCSearchMVChromaRefine(Mode& intraBCMode, |
2740 | | const CUGeom& cuGeom, |
2741 | | int roiWidth, |
2742 | | int roiHeight, |
2743 | | int cuPelX, |
2744 | | int cuPelY, |
2745 | | uint32_t* sadBestCand, |
2746 | | MV* MVCand, |
2747 | | uint32_t partOffset, |
2748 | | int puIdx |
2749 | | ) |
2750 | | { |
2751 | | int bestCandIdx = 0; |
2752 | | uint32_t sadBest = UINT_MAX; |
2753 | | uint32_t tempSad; |
2754 | | |
2755 | | pixel* ref; |
2756 | | const pixel* picOrg; |
2757 | | int refStride, orgStride; |
2758 | | int width, height; |
2759 | | |
2760 | | int picWidth = m_slice->m_sps->picWidthInLumaSamples; |
2761 | | int picHeight = m_slice->m_sps->picHeightInLumaSamples; |
2762 | | |
2763 | | CUData& cu = intraBCMode.cu; |
2764 | | Yuv& tmpPredYuv = intraBCMode.predYuv; |
2765 | | PredictionUnit pu(cu, cuGeom, puIdx); |
2766 | | |
2767 | | for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++) |
2768 | | { |
2769 | | if ((!MVCand[cand].x) && (!MVCand[cand].y)) |
2770 | | { |
2771 | | continue; |
2772 | | } |
2773 | | |
2774 | | if (((int)(cuPelY + MVCand[cand].y + roiHeight) >= picHeight) || ((cuPelY + MVCand[cand].y) < 0)) |
2775 | | { |
2776 | | continue; |
2777 | | } |
2778 | | |
2779 | | if (((int)(cuPelX + MVCand[cand].x + roiWidth) >= picWidth) || ((cuPelX + MVCand[cand].x) < 0)) |
2780 | | { |
2781 | | continue; |
2782 | | } |
2783 | | |
2784 | | tempSad = sadBestCand[cand]; |
2785 | | int bitDepths = m_param->sourceBitDepth; |
2786 | | MV mvQuaterPixl = MVCand[cand]; |
2787 | | mvQuaterPixl <<= 2; |
2788 | | cu.setPUMv(0, mvQuaterPixl, pu.puAbsPartIdx, puIdx); |
2789 | | cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx); |
2790 | | cu.setPUMv(1, MV(), pu.puAbsPartIdx, puIdx); |
2791 | | cu.setPURefIdx(1, -1, pu.puAbsPartIdx, puIdx); |
2792 | | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); |
2793 | | |
2794 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
2795 | | |
2796 | | for (uint32_t ch = TEXT_CHROMA_U; ch < MAX_NUM_COMPONENT; ch++) |
2797 | | { |
2798 | | ref = m_slice->m_refFrameList[0][m_slice->m_numRefIdx[0] - 1]->m_reconPic[1]->getChromaAddr(ch, cu.m_cuAddr, cu.m_absIdxInCTU + partOffset); |
2799 | | |
2800 | | picOrg = intraBCMode.fencYuv->getChromaAddr(ch, partOffset); |
2801 | | orgStride = intraBCMode.fencYuv->m_csize; |
2802 | | |
2803 | | refStride = m_frame->m_reconPic[1]->m_strideC; |
2804 | | |
2805 | | width = roiWidth >> m_hChromaShift; |
2806 | | height = roiHeight >> m_vChromaShift; |
2807 | | |
2808 | | ref = tmpPredYuv.getChromaAddr(ch, partOffset); |
2809 | | refStride = tmpPredYuv.m_csize; |
2810 | | |
2811 | | for (int row = 0; row < height; row++) |
2812 | | { |
2813 | | for (int col = 0; col < width; col++) |
2814 | | { |
2815 | | tempSad += ((abs(ref[col] - picOrg[col])) >> (bitDepths - 8)); |
2816 | | } |
2817 | | ref += refStride; |
2818 | | picOrg += orgStride; |
2819 | | } |
2820 | | } |
2821 | | |
2822 | | if (tempSad < sadBest) |
2823 | | { |
2824 | | sadBest = tempSad; |
2825 | | bestCandIdx = cand; |
2826 | | } |
2827 | | } |
2828 | | |
2829 | | return bestCandIdx; |
2830 | | } |
2831 | | |
2832 | | void Search::updateBVMergeCandLists(int roiWidth, int roiHeight, MV* mvCand, IBC& ibc) |
2833 | | { |
2834 | | if (roiWidth + roiHeight > 8) |
2835 | | { |
2836 | | ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvCand, CHROMA_REFINEMENT_CANDIDATES, false); |
2837 | | |
2838 | | if (roiWidth + roiHeight == 32) |
2839 | | { |
2840 | | ibc.m_numBV16s = ibc.m_numBVs; |
2841 | | } |
2842 | | } |
2843 | | } |
2844 | | |
2845 | | void Search::intraBCSearchMVCandUpdate(uint32_t sad, int x, int y, uint32_t* sadBestCand, MV* MVCand) |
2846 | | { |
2847 | | int j = CHROMA_REFINEMENT_CANDIDATES - 1; |
2848 | | |
2849 | | if (sad < sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
2850 | | { |
2851 | | for (int t = CHROMA_REFINEMENT_CANDIDATES - 1; t >= 0; t--) |
2852 | | { |
2853 | | if (sad < sadBestCand[t]) |
2854 | | { |
2855 | | j = t; |
2856 | | } |
2857 | | } |
2858 | | |
2859 | | for (int k = CHROMA_REFINEMENT_CANDIDATES - 1; k > j; k--) |
2860 | | { |
2861 | | sadBestCand[k] = sadBestCand[k - 1]; |
2862 | | |
2863 | | MVCand[k].set(MVCand[k - 1].x, MVCand[k - 1].y); |
2864 | | } |
2865 | | sadBestCand[j] = sad; |
2866 | | MVCand[j].set(x, y); |
2867 | | } |
2868 | | } |
2869 | | |
2870 | | uint32_t Search::mergeCandLists(MV* dst, uint32_t dn, MV* src, uint32_t sn, bool isSrcQuarPel) |
2871 | | { |
2872 | | for (uint32_t cand = 0; cand < sn && dn < SCM_S0067_NUM_CANDIDATES; cand++) |
2873 | | { |
2874 | | bool found = false; |
2875 | | MV TempMv = src[cand]; |
2876 | | if (!isSrcQuarPel) |
2877 | | { |
2878 | | TempMv <<= 2; |
2879 | | } |
2880 | | for (uint32_t j = 0; j < dn; j++) |
2881 | | { |
2882 | | if (TempMv == dst[j]) |
2883 | | { |
2884 | | found = true; |
2885 | | break; |
2886 | | } |
2887 | | } |
2888 | | |
2889 | | if (!found) |
2890 | | { |
2891 | | dst[dn] = TempMv; |
2892 | | dn++; |
2893 | | } |
2894 | | } |
2895 | | return dn; |
2896 | | } |
2897 | | |
2898 | | void Search::restrictBipredMergeCand(CUData* cu, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours, uint32_t numValidMergeCand) |
2899 | | { |
2900 | | { |
2901 | | for (uint32_t mergeCand = 0; mergeCand < numValidMergeCand; ++mergeCand) |
2902 | | { |
2903 | | if (interDirNeighbours[mergeCand] == 3) |
2904 | | { |
2905 | | bool b8x8BiPredRestricted = cu->is8x8BipredRestriction( |
2906 | | mvFieldNeighbours[mergeCand][0].mv, |
2907 | | mvFieldNeighbours[mergeCand][1].mv, |
2908 | | mvFieldNeighbours[mergeCand][0].refIdx, |
2909 | | mvFieldNeighbours[mergeCand][1].refIdx); |
2910 | | |
2911 | | int width = 0; |
2912 | | int height = 0; |
2913 | | uint32_t partAddr; |
2914 | | |
2915 | | cu->getPartIndexAndSize(puIdx, partAddr, width, height); |
2916 | | if (b8x8BiPredRestricted) |
2917 | | { |
2918 | | if (width <= 8 && height <= 8) |
2919 | | { |
2920 | | interDirNeighbours[mergeCand] = 1; |
2921 | | mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID; |
2922 | | } |
2923 | | } |
2924 | | else if (cu->isBipredRestriction()) |
2925 | | { |
2926 | | interDirNeighbours[mergeCand] = 1; |
2927 | | mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID; |
2928 | | } |
2929 | | } |
2930 | | } |
2931 | | } |
2932 | | } |
2933 | | |
2934 | | bool Search::isBlockVectorValid(int xPos, int yPos, int width, int height, CUData* cu, |
2935 | | int xStartInCU, int yStartInCU, int xBv, int yBv, int ctuSize) |
2936 | | { |
2937 | | static const int s_floorLog2[65] = |
2938 | | { |
2939 | | -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, |
2940 | | 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, |
2941 | | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
2942 | | 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, |
2943 | | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, |
2944 | | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, |
2945 | | 5, 5, 5, 5, 6 |
2946 | | }; |
2947 | | |
2948 | | int ctuSizeLog2 = s_floorLog2[ctuSize]; |
2949 | | int interpolationSamplesX = (cu->m_chromaFormat == X265_CSP_I422 || cu->m_chromaFormat == X265_CSP_I420) ? ((xBv & 0x1) << 1) : 0; |
2950 | | int interpolationSamplesY = (cu->m_chromaFormat == X265_CSP_I420) ? ((yBv & 0x1) << 1) : 0; |
2951 | | int refRightX = xPos + xBv + width - 1 + interpolationSamplesX; |
2952 | | int refBottomY = yPos + yBv + height - 1 + interpolationSamplesY; |
2953 | | int picWidth = m_slice->m_sps->picWidthInLumaSamples; |
2954 | | int picHeight = m_slice->m_sps->picHeightInLumaSamples; |
2955 | | |
2956 | | if ((xPos + xBv - interpolationSamplesX) < 0) |
2957 | | return false; |
2958 | | if (refRightX >= picWidth) |
2959 | | return false; |
2960 | | if ((yPos + yBv - interpolationSamplesY) < 0) |
2961 | | return false; |
2962 | | if (refBottomY >= picHeight) |
2963 | | return false; |
2964 | | |
2965 | | if ((xBv + width + interpolationSamplesX) > 0 && (yBv + height + interpolationSamplesY) > 0) |
2966 | | return false; |
2967 | | |
2968 | | if (refBottomY >> ctuSizeLog2 < yPos >> ctuSizeLog2) |
2969 | | { |
2970 | | int refCuX = refRightX / ctuSize; |
2971 | | int refCuY = refBottomY / ctuSize; |
2972 | | int cuPelX = xPos / ctuSize; |
2973 | | int cuPelY = yPos / ctuSize; |
2974 | | |
2975 | | if (((int)(refCuX - cuPelX) > (int)((cuPelY - refCuY)))) |
2976 | | return false; |
2977 | | else |
2978 | | return true; |
2979 | | } |
2980 | | |
2981 | | if (refBottomY >> ctuSizeLog2 > yPos >> ctuSizeLog2) |
2982 | | { |
2983 | | return false; |
2984 | | } |
2985 | | |
2986 | | // in the same CTU line |
2987 | | if (refRightX >> ctuSizeLog2 < xPos >> ctuSizeLog2) |
2988 | | return true; |
2989 | | if (refRightX >> ctuSizeLog2 > xPos >> ctuSizeLog2) |
2990 | | return false; |
2991 | | |
2992 | | // same CTU |
2993 | | int mask = 1 << ctuSizeLog2; |
2994 | | mask -= 1; |
2995 | | int rasterCurr = ((((yPos & mask) - yStartInCU) >> 2) << (ctuSizeLog2 - 2)) + (((xPos & mask) - xStartInCU) >> 2); |
2996 | | int rasterRef = (((refBottomY & mask) >> 2) << (ctuSizeLog2 - 2)) + ((refRightX & mask) >> 2); |
2997 | | |
2998 | | if (g_rasterToZscan[rasterRef] >= g_rasterToZscan[rasterCurr]) |
2999 | | return false; |
3000 | | return true; |
3001 | | } |
3002 | | |
3003 | | bool Search::isValidIntraBCSearchArea(CUData* cu, int predX, int predY, int roiWidth, int roiHeight, int partOffset) |
3004 | | { |
3005 | | const int cuPelX = cu->m_cuPelX + g_zscanToPelX[partOffset]; |
3006 | | const int cuPelY = cu->m_cuPelY + g_zscanToPelY[partOffset]; |
3007 | | |
3008 | | if (!isBlockVectorValid(cuPelX, cuPelY, roiWidth, roiHeight, cu, g_zscanToPelX[partOffset], g_zscanToPelY[partOffset], predX, predY, m_param->maxCUSize)) |
3009 | | { |
3010 | | return false; |
3011 | | } |
3012 | | return true; |
3013 | | } |
3014 | | |
3015 | | void Search::intraPatternSearch(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, uint32_t partAddr, pixel* refY, int refStride, MV* searchRangeLT, MV* searchRangeRB, |
3016 | | MV& mv, uint32_t& cost, int roiWidth, int roiHeight, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc) |
3017 | | { |
3018 | | const int srchRngHorLeft = searchRangeLT->x; |
3019 | | const int srchRngHorRight = searchRangeRB->x; |
3020 | | const int srchRngVerTop = searchRangeLT->y; |
3021 | | const int srchRngVerBottom = searchRangeRB->y; |
3022 | | |
3023 | | CUData& cu = intraBCMode.cu; |
3024 | | const uint32_t lcuWidth = m_param->maxCUSize; |
3025 | | const uint32_t lcuHeight = m_param->maxCUSize; |
3026 | | const int puPelOffsetX = g_zscanToPelX[partAddr]; |
3027 | | const int puPelOffsetY = g_zscanToPelY[partAddr]; |
3028 | | const int cuPelX = cu.m_cuPelX + puPelOffsetX; // Point to the location of PU |
3029 | | const int cuPelY = cu.m_cuPelY + puPelOffsetY; |
3030 | | |
3031 | | uint32_t sad = 0; |
3032 | | uint32_t sadBest = UINT_MAX; |
3033 | | int bestX = 0; |
3034 | | int bestY = 0; |
3035 | | pixel* refSrch; |
3036 | | |
3037 | | int bestCandIdx = 0; |
3038 | | uint32_t partOffset = 0; |
3039 | | MV MVCand[CHROMA_REFINEMENT_CANDIDATES]; |
3040 | | uint32_t sadBestCand[CHROMA_REFINEMENT_CANDIDATES]; |
3041 | | |
3042 | | partOffset = partAddr; |
3043 | | PredictionUnit pu(cu, cuGeom, puIdx); |
3044 | | for (int cand = 0; cand < CHROMA_REFINEMENT_CANDIDATES; cand++) |
3045 | | { |
3046 | | sadBestCand[cand] = UINT_MAX; |
3047 | | MVCand[cand].set(0, 0); |
3048 | | } |
3049 | | |
3050 | | const int relCUPelX = cuPelX % lcuWidth; |
3051 | | const int relCUPelY = cuPelY % lcuHeight; |
3052 | | const int chromaROIWidthInPixels = roiWidth; |
3053 | | const int chromaROIHeightInPixels = roiHeight; |
3054 | | bool fastsearch = (m_param->bEnableSCC == 1) ? true : false; |
3055 | | bool isFullFrameSearchrangeEnabled = false; // disabled by default |
3056 | | |
3057 | | if (fastsearch) |
3058 | | { |
3059 | | uint32_t tempSadBest = 0; |
3060 | | int srLeft = srchRngHorLeft, srRight = srchRngHorRight, srTop = srchRngVerTop, srBottom = srchRngVerBottom; |
3061 | | const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples; |
3062 | | const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples; |
3063 | | |
3064 | | if (isFullFrameSearchrangeEnabled)//full frame search |
3065 | | { |
3066 | | srLeft = -1 * cuPelX; |
3067 | | srTop = -1 * cuPelY; |
3068 | | |
3069 | | srRight = picWidth - cuPelX - roiWidth; |
3070 | | srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight; |
3071 | | |
3072 | | if (cuPelX + srRight + roiWidth > (int)picWidth) |
3073 | | { |
3074 | | srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth; |
3075 | | } |
3076 | | if (cuPelY + srBottom + roiHeight > (int)picHeight) |
3077 | | { |
3078 | | srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight; |
3079 | | } |
3080 | | } |
3081 | | |
3082 | | if (roiWidth > 8 || roiHeight > 8) |
3083 | | ibc.m_numBVs = 0; |
3084 | | else if (roiWidth + roiHeight == 16) |
3085 | | ibc.m_numBVs = ibc.m_numBV16s; |
3086 | | if (testOnlyPred) |
3087 | | ibc.m_numBVs = 0; |
3088 | | |
3089 | | MV mvPredEncOnly[16]; |
3090 | | int nbPreds = 0; |
3091 | | cu.getIntraBCMVPsEncOnly(partAddr, mvPredEncOnly, nbPreds, puIdx); |
3092 | | ibc.m_numBVs = mergeCandLists(ibc.m_BVs, ibc.m_numBVs, mvPredEncOnly, nbPreds, true); |
3093 | | |
3094 | | for (int cand = 0; cand < ibc.m_numBVs; cand++) |
3095 | | { |
3096 | | int xPred = ibc.m_BVs[cand].x >> 2; |
3097 | | int yPred = ibc.m_BVs[cand].y >> 2; |
3098 | | if (!(xPred == 0 && yPred == 0) && !((yPred < srTop) || (yPred > srBottom)) && !((xPred < srLeft) || (xPred > srRight))) |
3099 | | { |
3100 | | int tempY = yPred + relCUPelY + roiHeight - 1; |
3101 | | int tempX = xPred + relCUPelX + roiWidth - 1; |
3102 | | bool validCand = isValidIntraBCSearchArea(&cu, xPred, yPred, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset); |
3103 | | |
3104 | | if ((tempX >= (int)lcuWidth) && (tempY >= 0) && isFullFrameSearchrangeEnabled) |
3105 | | validCand = false; |
3106 | | |
3107 | | if ((tempX >= 0) && (tempY >= 0)) |
3108 | | { |
3109 | | int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4); |
3110 | | uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx]; |
3111 | | if (tempZscanIdx >= cu.m_absIdxInCTU) |
3112 | | { |
3113 | | validCand = false; |
3114 | | } |
3115 | | } |
3116 | | |
3117 | | if (validCand) |
3118 | | { |
3119 | | sad = m_me.mvcost(ibc.m_BVs[cand]); |
3120 | | |
3121 | | refSrch = refY + yPred * refStride + xPred; |
3122 | | |
3123 | | sad += m_me.bufSAD(refSrch, refStride); |
3124 | | if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
3125 | | { |
3126 | | continue; |
3127 | | } |
3128 | | |
3129 | | intraBCSearchMVCandUpdate(sad, xPred, yPred, sadBestCand, MVCand); |
3130 | | } |
3131 | | } |
3132 | | } |
3133 | | bestX = MVCand[0].x; |
3134 | | bestY = MVCand[0].y; |
3135 | | mv.set(bestX, bestY); |
3136 | | sadBest = sadBestCand[0]; |
3137 | | |
3138 | | if (testOnlyPred) |
3139 | | { |
3140 | | cost = sadBest; |
3141 | | return; |
3142 | | } |
3143 | | |
3144 | | const int boundY = (0 - roiHeight - puPelOffsetY); |
3145 | | int lowY = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled) |
3146 | | ? -cuPelY : X265_MAX(srchRngVerTop, 0 - cuPelY); |
3147 | | for (int y = boundY; y >= lowY; y--) |
3148 | | { |
3149 | | if (!isValidIntraBCSearchArea(&cu, 0, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3150 | | { |
3151 | | continue; |
3152 | | } |
3153 | | |
3154 | | sad = m_me.mvcost(MV(0, y)); |
3155 | | |
3156 | | refSrch = refY + y * refStride; |
3157 | | |
3158 | | sad += m_me.bufSAD(refSrch, refStride); |
3159 | | if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
3160 | | { |
3161 | | continue; |
3162 | | } |
3163 | | |
3164 | | intraBCSearchMVCandUpdate(sad, 0, y, sadBestCand, MVCand); |
3165 | | tempSadBest = sadBestCand[0]; |
3166 | | if (sadBestCand[0] <= 3) |
3167 | | { |
3168 | | bestX = MVCand[0].x; |
3169 | | bestY = MVCand[0].y; |
3170 | | sadBest = sadBestCand[0]; |
3171 | | mv.set(bestX, bestY); |
3172 | | cost = sadBest; |
3173 | | |
3174 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3175 | | return; |
3176 | | } |
3177 | | } |
3178 | | |
3179 | | const int boundX = ((cu.m_partSize[partAddr] == SCM_S0067_IBC_FULL_1D_SEARCH_FOR_PU) && isFullFrameSearchrangeEnabled) |
3180 | | ? -cuPelX : X265_MAX(srchRngHorLeft, -cuPelX); |
3181 | | for (int x = 0 - roiWidth - puPelOffsetX; x >= boundX; --x) |
3182 | | { |
3183 | | if (!isValidIntraBCSearchArea(&cu, x, 0, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3184 | | { |
3185 | | continue; |
3186 | | } |
3187 | | |
3188 | | sad = m_me.mvcost(MV(x, 0)); |
3189 | | |
3190 | | refSrch = refY + x; |
3191 | | sad += m_me.bufSAD(refSrch, refStride); |
3192 | | |
3193 | | if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
3194 | | { |
3195 | | continue; |
3196 | | } |
3197 | | |
3198 | | intraBCSearchMVCandUpdate(sad, x, 0, sadBestCand, MVCand); |
3199 | | tempSadBest = sadBestCand[0]; |
3200 | | if (sadBestCand[0] <= 3) |
3201 | | { |
3202 | | bestX = MVCand[0].x; |
3203 | | bestY = MVCand[0].y; |
3204 | | sadBest = sadBestCand[0]; |
3205 | | mv.set(bestX, bestY); |
3206 | | cost = sadBest; |
3207 | | |
3208 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3209 | | return; |
3210 | | } |
3211 | | } |
3212 | | |
3213 | | bestX = MVCand[0].x; |
3214 | | bestY = MVCand[0].y; |
3215 | | sadBest = sadBestCand[0]; |
3216 | | |
3217 | | if ((!bestX && !bestY) || (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 32)) |
3218 | | { |
3219 | | //chroma refine |
3220 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3221 | | bestX = MVCand[bestCandIdx].x; |
3222 | | bestY = MVCand[bestCandIdx].y; |
3223 | | sadBest = sadBestCand[bestCandIdx]; |
3224 | | mv.set(bestX, bestY); |
3225 | | cost = sadBest; |
3226 | | |
3227 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3228 | | return; |
3229 | | } |
3230 | | |
3231 | | if (cuGeom.depth > 2 && !bUse1DSearchFor8x8) |
3232 | | { |
3233 | | for (int y = X265_MAX(srchRngVerTop, -cuPelY); y <= srchRngVerBottom; y += 2) |
3234 | | { |
3235 | | if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight)) |
3236 | | { |
3237 | | continue; |
3238 | | } |
3239 | | |
3240 | | int tempY = y + relCUPelY + roiHeight - 1; |
3241 | | |
3242 | | for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x++) |
3243 | | { |
3244 | | if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth)) |
3245 | | { |
3246 | | continue; |
3247 | | } |
3248 | | |
3249 | | int tempX = x + relCUPelX + roiWidth - 1; |
3250 | | |
3251 | | if ((tempX >= 0) && (tempY >= 0)) |
3252 | | { |
3253 | | int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4); |
3254 | | uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx]; |
3255 | | if (iTempZscanIdx >= cu.m_absIdxInCTU) |
3256 | | { |
3257 | | continue; |
3258 | | } |
3259 | | } |
3260 | | |
3261 | | if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3262 | | { |
3263 | | continue; |
3264 | | } |
3265 | | |
3266 | | sad = m_me.mvcost(MV(x, y)); |
3267 | | |
3268 | | refSrch = refY + y * refStride + x; |
3269 | | sad += m_me.bufSAD(refSrch, refStride); |
3270 | | |
3271 | | intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand); |
3272 | | } |
3273 | | } |
3274 | | |
3275 | | bestX = MVCand[0].x; |
3276 | | bestY = MVCand[0].y; |
3277 | | sadBest = sadBestCand[0]; |
3278 | | if (sadBest - m_me.mvcost(MV(bestX, bestY)) <= 16) |
3279 | | { |
3280 | | //chroma refine |
3281 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3282 | | bestX = MVCand[bestCandIdx].x; |
3283 | | bestY = MVCand[bestCandIdx].y; |
3284 | | sadBest = sadBestCand[bestCandIdx]; |
3285 | | mv.set(bestX, bestY); |
3286 | | cost = sadBest; |
3287 | | |
3288 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3289 | | return; |
3290 | | } |
3291 | | |
3292 | | for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2) |
3293 | | { |
3294 | | if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight)) |
3295 | | { |
3296 | | continue; |
3297 | | } |
3298 | | |
3299 | | int tempY = y + relCUPelY + roiHeight - 1; |
3300 | | |
3301 | | for (int x = X265_MAX(srchRngHorLeft, -cuPelX); x <= srchRngHorRight; x += 2) |
3302 | | { |
3303 | | if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth)) |
3304 | | { |
3305 | | continue; |
3306 | | } |
3307 | | |
3308 | | int tempX = x + relCUPelX + roiWidth - 1; |
3309 | | |
3310 | | if ((tempX >= 0) && (tempY >= 0)) |
3311 | | { |
3312 | | int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4); |
3313 | | uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx]; |
3314 | | if (tempZscanIdx >= cu.m_absIdxInCTU) |
3315 | | { |
3316 | | continue; |
3317 | | } |
3318 | | } |
3319 | | |
3320 | | if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3321 | | { |
3322 | | continue; |
3323 | | } |
3324 | | |
3325 | | sad = m_me.mvcost(MV(x, y)); |
3326 | | |
3327 | | refSrch = refY + y * refStride + x; |
3328 | | sad += m_me.bufSAD(refSrch, refStride); |
3329 | | |
3330 | | if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
3331 | | { |
3332 | | continue; |
3333 | | } |
3334 | | |
3335 | | intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand); |
3336 | | if (sadBestCand[0] <= 5) |
3337 | | { |
3338 | | //chroma refine & return |
3339 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3340 | | bestX = MVCand[bestCandIdx].x; |
3341 | | bestY = MVCand[bestCandIdx].y; |
3342 | | sadBest = sadBestCand[bestCandIdx]; |
3343 | | mv.set(bestX, bestY); |
3344 | | cost = sadBest; |
3345 | | |
3346 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3347 | | return; |
3348 | | } |
3349 | | } |
3350 | | } |
3351 | | |
3352 | | bestX = MVCand[0].x; |
3353 | | bestY = MVCand[0].y; |
3354 | | sadBest = sadBestCand[0]; |
3355 | | |
3356 | | if ((sadBest >= tempSadBest) || ((sadBest - m_me.mvcost(MV(bestX, bestY))) <= 32)) |
3357 | | { |
3358 | | //chroma refine |
3359 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3360 | | bestX = MVCand[bestCandIdx].x; |
3361 | | bestY = MVCand[bestCandIdx].y; |
3362 | | sadBest = sadBestCand[bestCandIdx]; |
3363 | | mv.set(bestX, bestY); |
3364 | | cost = sadBest; |
3365 | | |
3366 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3367 | | return; |
3368 | | } |
3369 | | |
3370 | | tempSadBest = sadBestCand[0]; |
3371 | | |
3372 | | |
3373 | | for (int y = (X265_MAX(srchRngVerTop, -cuPelY) + 1); y <= srchRngVerBottom; y += 2) |
3374 | | { |
3375 | | if ((y == 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight)) |
3376 | | { |
3377 | | continue; |
3378 | | } |
3379 | | |
3380 | | int tempY = y + relCUPelY + roiHeight - 1; |
3381 | | |
3382 | | for (int x = (X265_MAX(srchRngHorLeft, -cuPelX) + 1); x <= srchRngHorRight; x += 2) |
3383 | | { |
3384 | | |
3385 | | if ((x == 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth)) |
3386 | | { |
3387 | | continue; |
3388 | | } |
3389 | | |
3390 | | int tempX = x + relCUPelX + roiWidth - 1; |
3391 | | |
3392 | | if ((tempX >= 0) && (tempY >= 0)) |
3393 | | { |
3394 | | int tempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4); |
3395 | | uint32_t tempZscanIdx = g_rasterToZscan[tempRasterIdx]; |
3396 | | if (tempZscanIdx >= cu.m_absIdxInCTU) |
3397 | | { |
3398 | | continue; |
3399 | | } |
3400 | | } |
3401 | | |
3402 | | if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3403 | | { |
3404 | | continue; |
3405 | | } |
3406 | | |
3407 | | sad = m_me.mvcost(MV(x, y)); |
3408 | | |
3409 | | refSrch = refY + y * refStride + x; |
3410 | | sad += m_me.bufSAD(refSrch, refStride); |
3411 | | if (sad > sadBestCand[CHROMA_REFINEMENT_CANDIDATES - 1]) |
3412 | | { |
3413 | | continue; |
3414 | | } |
3415 | | |
3416 | | intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand); |
3417 | | if (sadBestCand[0] <= 5) |
3418 | | { |
3419 | | //chroma refine & return |
3420 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3421 | | bestX = MVCand[bestCandIdx].x; |
3422 | | bestY = MVCand[bestCandIdx].y; |
3423 | | sadBest = sadBestCand[bestCandIdx]; |
3424 | | mv.set(bestX, bestY); |
3425 | | cost = sadBest; |
3426 | | |
3427 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3428 | | return; |
3429 | | } |
3430 | | } |
3431 | | } |
3432 | | } |
3433 | | } |
3434 | | else //full search |
3435 | | { |
3436 | | refY += (srchRngVerBottom * refStride); |
3437 | | int picWidth = m_slice->m_sps->picWidthInLumaSamples; |
3438 | | int picHeight = m_slice->m_sps->picHeightInLumaSamples; |
3439 | | |
3440 | | for (int y = srchRngVerBottom; y >= srchRngVerTop; y--) |
3441 | | { |
3442 | | if (((int)(cuPelY + y) < 0) || ((int)(cuPelY + y + roiHeight) >= (int)picHeight)) |
3443 | | { |
3444 | | refY -= refStride; |
3445 | | continue; |
3446 | | } |
3447 | | |
3448 | | for (int x = srchRngHorLeft; x <= srchRngHorRight; x++) |
3449 | | { |
3450 | | |
3451 | | if (((int)(cuPelX + x) < 0) || ((int)(cuPelX + x + roiWidth) >= (int)picWidth)) |
3452 | | { |
3453 | | continue; |
3454 | | } |
3455 | | |
3456 | | int tempX = x + relCUPelX + roiWidth - 1; |
3457 | | int tempY = y + relCUPelY + roiHeight - 1; |
3458 | | if ((tempX >= 0) && (tempY >= 0)) |
3459 | | { |
3460 | | int iTempRasterIdx = (tempY / 4) * cu.s_numPartInCUSize + (tempX / 4); |
3461 | | uint32_t iTempZscanIdx = g_rasterToZscan[iTempRasterIdx]; |
3462 | | if (iTempZscanIdx >= cu.m_absIdxInCTU) |
3463 | | { |
3464 | | continue; |
3465 | | } |
3466 | | } |
3467 | | |
3468 | | if (!isValidIntraBCSearchArea(&cu, x, y, chromaROIWidthInPixels, chromaROIHeightInPixels, partOffset)) |
3469 | | { |
3470 | | continue; |
3471 | | } |
3472 | | |
3473 | | refSrch = refY + x; |
3474 | | |
3475 | | sad = m_me.bufSAD(refSrch, refStride); |
3476 | | sad += m_me.mvcost(MV(x, y)); |
3477 | | if (sad < sadBest) |
3478 | | { |
3479 | | sadBest = sad; |
3480 | | bestX = x; |
3481 | | bestY = y; |
3482 | | } |
3483 | | intraBCSearchMVCandUpdate(sad, x, y, sadBestCand, MVCand); |
3484 | | } |
3485 | | |
3486 | | refY -= refStride; |
3487 | | } |
3488 | | } |
3489 | | |
3490 | | bestCandIdx = intraBCSearchMVChromaRefine(intraBCMode, cuGeom, roiWidth, roiHeight, cuPelX, cuPelY, sadBestCand, MVCand, partOffset, puIdx); |
3491 | | bestX = MVCand[bestCandIdx].x; |
3492 | | bestY = MVCand[bestCandIdx].y; |
3493 | | sadBest = sadBestCand[bestCandIdx]; |
3494 | | mv.set(bestX, bestY); |
3495 | | cost = sadBest; |
3496 | | |
3497 | | updateBVMergeCandLists(roiWidth, roiHeight, MVCand, ibc); |
3498 | | |
3499 | | } |
3500 | | |
3501 | | void Search::setIntraSearchRange(Mode& intraBCMode, MV& pred, int puIdx, int roiWidth, int roiHeight, MV& searchRangeLT, MV& searchRangeRB) |
3502 | | { |
3503 | | MV mvPred = pred; |
3504 | | CUData& cu = intraBCMode.cu; |
3505 | | cu.clipMv(mvPred); |
3506 | | int srLeft, srRight, srTop, srBottom; |
3507 | | int puWidth, puHeight; |
3508 | | uint32_t partAddr; |
3509 | | |
3510 | | cu.getPartIndexAndSize(puIdx, partAddr, puWidth, puHeight); |
3511 | | |
3512 | | const uint32_t lcuWidth = m_param->maxCUSize; |
3513 | | const uint32_t lcuHeight = m_param->maxCUSize; |
3514 | | const uint32_t cuPelX = cu.m_cuPelX + g_zscanToPelX[partAddr]; |
3515 | | const uint32_t cuPelY = cu.m_cuPelY + g_zscanToPelY[partAddr]; |
3516 | | |
3517 | | const uint32_t picWidth = m_slice->m_sps->picWidthInLumaSamples; |
3518 | | const uint32_t picHeight = m_slice->m_sps->picHeightInLumaSamples; |
3519 | | bool isFullFrameSearchrangeEnabled = false; // disabled by default |
3520 | | if (1 << cu.m_log2CUSize[0] == 16 && cu.m_partSize[0] == SIZE_2Nx2N && isFullFrameSearchrangeEnabled)// full frame search |
3521 | | { |
3522 | | srLeft = -1 * cuPelX; |
3523 | | srTop = -1 * cuPelY; |
3524 | | |
3525 | | srRight = picWidth - cuPelX - roiWidth; |
3526 | | srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight; |
3527 | | } |
3528 | | else |
3529 | | { |
3530 | | const uint32_t searchWidthInCTUs = 1 << cu.m_log2CUSize[0] == 8 ? 1 : (isFullFrameSearchrangeEnabled) ? -1 : 1; |
3531 | | uint32_t width = 0, maxWidth = searchWidthInCTUs * lcuWidth; |
3532 | | for (const CUData* pTestCU = cu.m_cuLeft; |
3533 | | width < maxWidth && pTestCU != NULL && pTestCU->m_slice != NULL; |
3534 | | pTestCU = pTestCU->m_cuLeft, width += lcuWidth) |
3535 | | { |
3536 | | } |
3537 | | int maxXsr = (cuPelX % lcuWidth) + X265_MIN(maxWidth, width); |
3538 | | int maxYsr = cuPelY % lcuHeight; |
3539 | | |
3540 | | if (cu.m_chromaFormat == X265_CSP_I420 || cu.m_chromaFormat == X265_CSP_I422) maxXsr &= ~0x4; |
3541 | | if (cu.m_chromaFormat == X265_CSP_I420) maxYsr &= ~0x4; |
3542 | | |
3543 | | srLeft = -maxXsr; |
3544 | | srTop = -maxYsr; |
3545 | | |
3546 | | srRight = lcuWidth - cuPelX % lcuWidth - roiWidth; |
3547 | | srBottom = lcuHeight - cuPelY % lcuHeight - roiHeight; |
3548 | | } |
3549 | | |
3550 | | if (cuPelX + srRight + roiWidth > picWidth) |
3551 | | { |
3552 | | srRight = picWidth % lcuWidth - cuPelX % lcuWidth - roiWidth; |
3553 | | } |
3554 | | if (cuPelY + srBottom + roiHeight > picHeight) |
3555 | | { |
3556 | | srBottom = picHeight % lcuHeight - cuPelY % lcuHeight - roiHeight; |
3557 | | } |
3558 | | |
3559 | | searchRangeLT.x = srLeft; |
3560 | | searchRangeLT.y = srTop; |
3561 | | searchRangeRB.x = srRight; |
3562 | | searchRangeRB.y = srBottom; |
3563 | | |
3564 | | cu.clipMv(searchRangeLT); |
3565 | | cu.clipMv(searchRangeRB); |
3566 | | |
3567 | | } |
3568 | | |
3569 | | void Search::intraBlockCopyEstimate(Mode& intraBCMode, const CUGeom& cuGeom, int puIdx, MV* pred, MV& mv, uint32_t& cost, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc) |
3570 | | { |
3571 | | uint32_t partAddr; |
3572 | | int roiWidth; |
3573 | | int roiHeight; |
3574 | | |
3575 | | MV searchRangeLT; |
3576 | | MV searchRangeRB; |
3577 | | MV mvPred = *pred; |
3578 | | const MV predictors = *pred; |
3579 | | |
3580 | | CUData& cu = intraBCMode.cu; |
3581 | | cu.getPartIndexAndSize(puIdx, partAddr, roiWidth, roiHeight); |
3582 | | |
3583 | | int ref = m_slice->m_numRefIdx[0] - 1; |
3584 | | pixel* refY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->getLumaAddr(cu.m_cuAddr, cu.m_absIdxInCTU + partAddr); |
3585 | | int strideY = m_slice->m_refFrameList[0][ref]->m_reconPic[1]->m_stride; |
3586 | | |
3587 | | setIntraSearchRange(intraBCMode, mvPred, puIdx, roiWidth, roiHeight, searchRangeLT, searchRangeRB); |
3588 | | |
3589 | | m_me.setMVP(predictors); |
3590 | | |
3591 | | intraPatternSearch(intraBCMode, cuGeom, puIdx, partAddr, refY, strideY, &searchRangeLT, &searchRangeRB, mv, cost, roiWidth, roiHeight, testOnlyPred, bUse1DSearchFor8x8, ibc); |
3592 | | } |
3593 | | |
3594 | | bool Search::predIntraBCSearch(Mode& intraBCMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, bool testOnlyPred, bool bUse1DSearchFor8x8, IBC& ibc) |
3595 | | { |
3596 | | MV zeroMv(0, 0); |
3597 | | CUData& cu = intraBCMode.cu; |
3598 | | Yuv* predYuv = &intraBCMode.predYuv; |
3599 | | Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; |
3600 | | int numPart = cu.getNumPartInter(0); |
3601 | | int log2ParallelMergeLevelMinus2 = 0; |
3602 | | |
3603 | | // 12 mv candidates including lowresMV |
3604 | | MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; |
3605 | | |
3606 | | if (m_param->bEnableSCC == 1 && (1 << cu.m_log2CUSize[0]) > SCM_S0067_MAX_CAND_SIZE) // fast search |
3607 | | return false; |
3608 | | |
3609 | | uint32_t totalCost = 0; |
3610 | | for (int puIdx = 0; puIdx < numPart; puIdx++) |
3611 | | { |
3612 | | int width, height; |
3613 | | uint32_t partAddr = 0; |
3614 | | MotionData* bestME = intraBCMode.bestME[puIdx]; |
3615 | | PredictionUnit pu(cu, cuGeom, puIdx); |
3616 | | MV mv, mvPred[2]; |
3617 | | cu.getPartIndexAndSize(puIdx, pu.puAbsPartIdx, width, height); |
3618 | | partAddr = pu.puAbsPartIdx; |
3619 | | m_me.setSourcePU(*intraBCMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC); |
3620 | | |
3621 | | cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, intraBCMode.interNeighbours); |
3622 | | cu.getPMV(intraBCMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, puIdx, pu.puAbsPartIdx); |
3623 | | |
3624 | | mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2); |
3625 | | mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2); |
3626 | | |
3627 | | uint32_t cost; |
3628 | | mv.set(0, 0); |
3629 | | intraBlockCopyEstimate(intraBCMode, cuGeom, puIdx, mvPred, mv, cost, testOnlyPred, bUse1DSearchFor8x8, ibc); |
3630 | | |
3631 | | bestME->mv.set(mv.x << 2, mv.y << 2); |
3632 | | bestME->cost = cost; |
3633 | | totalCost += cost; |
3634 | | if (mv.x == 0 && mv.y == 0) |
3635 | | { |
3636 | | if (testOnlyPred) |
3637 | | { |
3638 | | m_lastCandCost = MAX_UINT; |
3639 | | } |
3640 | | return false; |
3641 | | } |
3642 | | |
3643 | | int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp; |
3644 | | int distAMVPBest, distMergeTemp; |
3645 | | int costAMVPBest, costMergeBest, costMergeTemp; |
3646 | | bitsAMVPBest = MAX_INT; |
3647 | | costAMVPBest = MAX_INT; |
3648 | | costMergeBest = MAX_INT; |
3649 | | int mvpIdxBest = 0; |
3650 | | int mvpIdxTemp; |
3651 | | int mrgIdxBest = -1; |
3652 | | int mrgIdxTemp = -1; |
3653 | | int xCUStart = cu.m_cuPelX; |
3654 | | int yCUStart = cu.m_cuPelY; |
3655 | | int xStartInCU = 0, yStartInCU = 0; |
3656 | | if (ePartSize == SIZE_2Nx2N) |
3657 | | xStartInCU = yStartInCU = 0; |
3658 | | else if (ePartSize == SIZE_2NxN) |
3659 | | { |
3660 | | xStartInCU = 0; |
3661 | | yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx; |
3662 | | } |
3663 | | else if (ePartSize == SIZE_Nx2N) |
3664 | | { |
3665 | | xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * puIdx; |
3666 | | yStartInCU = 0; |
3667 | | } |
3668 | | const pixel* currStart; |
3669 | | pixel* ref; |
3670 | | int currStride, refStride; |
3671 | | distAMVPBest = 0; |
3672 | | |
3673 | | MV cMvQuaterPixl = mv; |
3674 | | cMvQuaterPixl <<= 2; |
3675 | | cu.setPUMv(0, cMvQuaterPixl, pu.puAbsPartIdx, puIdx); |
3676 | | cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, puIdx); |
3677 | | cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx); |
3678 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); |
3679 | | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); |
3680 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
3681 | | for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++) |
3682 | | { |
3683 | | int tempHeight, tempWidth; |
3684 | | if (ch == 0) |
3685 | | { |
3686 | | tempHeight = height; |
3687 | | tempWidth = width; |
3688 | | ref = tmpPredYuv.getLumaAddr(partAddr); |
3689 | | refStride = tmpPredYuv.m_size; |
3690 | | distAMVPBest += m_me.bufSAD(ref, refStride); |
3691 | | } |
3692 | | else |
3693 | | { |
3694 | | tempHeight = height >> m_vChromaShift; |
3695 | | tempWidth = width >> m_hChromaShift; |
3696 | | |
3697 | | currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr); |
3698 | | currStride = intraBCMode.fencYuv->m_csize; |
3699 | | ref = tmpPredYuv.getChromaAddr(ch, partAddr); |
3700 | | refStride = tmpPredYuv.m_csize; |
3701 | | distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight); |
3702 | | } |
3703 | | } |
3704 | | |
3705 | | mvPred[0].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0].y >> 2); |
3706 | | mvPred[1].set(intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].x >> 2, intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1].y >> 2); |
3707 | | |
3708 | | for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++) |
3709 | | { |
3710 | | m_me.setMVP(mvPred[mvpIdxTemp]); |
3711 | | bitsAMVPTemp = m_me.bitcost(mv, mvPred[mvpIdxTemp]); |
3712 | | if (bitsAMVPTemp < bitsAMVPBest) |
3713 | | { |
3714 | | bitsAMVPBest = bitsAMVPTemp; |
3715 | | mvpIdxBest = mvpIdxTemp; |
3716 | | } |
3717 | | } |
3718 | | |
3719 | | bitsAMVPBest++; // for MVP Index bits |
3720 | | costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest); |
3721 | | |
3722 | | MVField cMvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists |
3723 | | uint8_t uhInterDirNeighbours[MRG_MAX_NUM_CANDS]; |
3724 | | int numValidMergeCand = 0; |
3725 | | |
3726 | | for (int i = 0; i < MRG_MAX_NUM_CANDS; i++) |
3727 | | { |
3728 | | cMvFieldNeighbours[i][0].mv.set(0, 0); |
3729 | | cMvFieldNeighbours[i][0].refIdx = REF_NOT_VALID; |
3730 | | } |
3731 | | |
3732 | | if (ePartSize != SIZE_2Nx2N) |
3733 | | { |
3734 | | if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8) |
3735 | | { |
3736 | | cu.setPartSizeSubParts(SIZE_2Nx2N); |
3737 | | if (puIdx == 0) |
3738 | | { |
3739 | | numValidMergeCand = cu.getInterMergeCandidates(0, 0, cMvFieldNeighbours, uhInterDirNeighbours); |
3740 | | } |
3741 | | cu.setPartSizeSubParts(ePartSize); |
3742 | | } |
3743 | | else |
3744 | | { |
3745 | | numValidMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, cMvFieldNeighbours, uhInterDirNeighbours); |
3746 | | } |
3747 | | |
3748 | | cu.roundMergeCandidates(cMvFieldNeighbours, numValidMergeCand); |
3749 | | restrictBipredMergeCand(&cu, puIdx, cMvFieldNeighbours, uhInterDirNeighbours, numValidMergeCand); |
3750 | | |
3751 | | for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCand; mrgIdxTemp++) |
3752 | | { |
3753 | | if (uhInterDirNeighbours[mrgIdxTemp] != 1) |
3754 | | { |
3755 | | continue; |
3756 | | } |
3757 | | if (m_slice->m_refPOCList[0][cMvFieldNeighbours[mrgIdxTemp][0].refIdx] != m_slice->m_poc) |
3758 | | { |
3759 | | continue; |
3760 | | } |
3761 | | |
3762 | | if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, width, height, &cu, |
3763 | | xStartInCU, yStartInCU, (cMvFieldNeighbours[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighbours[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize)) |
3764 | | { |
3765 | | continue; |
3766 | | } |
3767 | | bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1; |
3768 | | |
3769 | | distMergeTemp = 0; |
3770 | | |
3771 | | cu.setPUMv(0, cMvFieldNeighbours[mrgIdxTemp][0].mv, pu.puAbsPartIdx, puIdx); |
3772 | | cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, puIdx); |
3773 | | cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, puIdx); |
3774 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); |
3775 | | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); |
3776 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
3777 | | |
3778 | | for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++) |
3779 | | { |
3780 | | int tempHeight, tempWidth; |
3781 | | if (ch == 0) |
3782 | | { |
3783 | | tempHeight = height; |
3784 | | tempWidth = width; |
3785 | | ref = tmpPredYuv.getLumaAddr(partAddr); |
3786 | | refStride = tmpPredYuv.m_size; |
3787 | | distMergeTemp += m_me.bufSAD(ref, refStride); |
3788 | | } |
3789 | | else |
3790 | | { |
3791 | | tempHeight = height >> m_vChromaShift; |
3792 | | tempWidth = width >> m_hChromaShift; |
3793 | | |
3794 | | currStart = intraBCMode.fencYuv->getChromaAddr(ch, partAddr); |
3795 | | currStride = intraBCMode.fencYuv->m_csize; |
3796 | | ref = tmpPredYuv.getChromaAddr(ch, partAddr); |
3797 | | refStride = tmpPredYuv.m_csize; |
3798 | | distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight); |
3799 | | } |
3800 | | } |
3801 | | costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp); |
3802 | | |
3803 | | if (costMergeTemp < costMergeBest) |
3804 | | { |
3805 | | costMergeBest = costMergeTemp; |
3806 | | mrgIdxBest = mrgIdxTemp; |
3807 | | } |
3808 | | } |
3809 | | } |
3810 | | if (costAMVPBest < costMergeBest) |
3811 | | { |
3812 | | MV tempmv((mv.x << 2), (mv.y << 2)); |
3813 | | MVField mvField[2]; |
3814 | | mvField[0].mv = tempmv; |
3815 | | mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1; // the current picture is at the last position of list0 |
3816 | | mvField[1].mv = zeroMv; |
3817 | | mvField[1].refIdx = REF_NOT_VALID; |
3818 | | |
3819 | | cu.m_mergeFlag[pu.puAbsPartIdx] = false; |
3820 | | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); // list 0 prediction |
3821 | | |
3822 | | cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx); |
3823 | | cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx); |
3824 | | cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx); |
3825 | | cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx); |
3826 | | |
3827 | | MV mvd; |
3828 | | mvd.set(mv.x - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].x >> 2), mv.y - (intraBCMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][mvpIdxBest].y >> 2)); |
3829 | | |
3830 | | cu.m_mvd[0][pu.puAbsPartIdx] = mvd; |
3831 | | cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mvpIdxBest; |
3832 | | } |
3833 | | else |
3834 | | { |
3835 | | MV MV(cMvFieldNeighbours[mrgIdxBest][0].mv.x, cMvFieldNeighbours[mrgIdxBest][0].mv.y); |
3836 | | MVField mvField[2]; |
3837 | | mvField[0].mv = MV; |
3838 | | mvField[0].refIdx = cu.m_slice->m_numRefIdx[0] - 1; // the current picture is at the last position of list0 |
3839 | | mvField[1].mv = zeroMv; |
3840 | | mvField[1].refIdx = REF_NOT_VALID; |
3841 | | |
3842 | | cu.m_mergeFlag[pu.puAbsPartIdx] = true; |
3843 | | cu.m_mvpIdx[0][pu.puAbsPartIdx] = (uint8_t)mrgIdxBest; /* merge candidate ID is stored in L0 MVP idx */ |
3844 | | cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); // list 0 prediction |
3845 | | |
3846 | | cu.setPUMv(0, mvField[0].mv, pu.puAbsPartIdx, puIdx); |
3847 | | cu.setPURefIdx(0, (int8_t)mvField[0].refIdx, pu.puAbsPartIdx, puIdx); |
3848 | | cu.setPUMv(1, mvField[1].mv, pu.puAbsPartIdx, puIdx); |
3849 | | cu.setPURefIdx(1, (int8_t)mvField[1].refIdx, pu.puAbsPartIdx, puIdx); |
3850 | | |
3851 | | cu.m_mvd[0][pu.puAbsPartIdx] = zeroMv; |
3852 | | cu.m_mvd[1][pu.puAbsPartIdx] = zeroMv; |
3853 | | } |
3854 | | motionCompensation(cu, pu, *predYuv, 1, 1); |
3855 | | } |
3856 | | |
3857 | | PredictionUnit pu(cu, cuGeom, 0); |
3858 | | uint32_t abortThreshold = (1 << cu.m_log2CUSize[0]) * (1 << cu.m_log2CUSize[0]) * 2; |
3859 | | if (testOnlyPred) |
3860 | | { |
3861 | | if (numPart == 1 && totalCost > abortThreshold) |
3862 | | { |
3863 | | m_lastCandCost = MAX_UINT; |
3864 | | return false; |
3865 | | } |
3866 | | m_lastCandCost = totalCost; |
3867 | | } |
3868 | | else if (totalCost < abortThreshold && 3 * totalCost >> 2 >= m_lastCandCost) |
3869 | | { |
3870 | | return false; |
3871 | | } |
3872 | | return true; |
3873 | | } |
3874 | | |
3875 | | bool Search::predMixedIntraBCInterSearch(Mode& intraBCMixedMode, const CUGeom& cuGeom, bool bChromaMC, PartSize ePartSize, MV* iMvCandList) |
3876 | | { |
3877 | | intraBCMixedMode.initCosts(); |
3878 | | intraBCMixedMode.cu.setPartSizeSubParts(ePartSize); |
3879 | | intraBCMixedMode.cu.setPredModeSubParts(MODE_INTER); |
3880 | | CUData& cu = intraBCMixedMode.cu; |
3881 | | int numComb = 2; |
3882 | | int numPart = 2; |
3883 | | uint32_t cost[2] = { 0,0 }; |
3884 | | uint32_t maxCost = UINT32_MAX; |
3885 | | |
3886 | | int numPredDir = m_slice->isInterP() ? 1 : 2; |
3887 | | MV cMvZero(0, 0); |
3888 | | |
3889 | | MV cMvPredCand[2][2]; |
3890 | | int IBCValidFlag = 0; |
3891 | | int bestIBCMvpIdx[2] = { 0, 0 }; |
3892 | | int bestInterMvpIdx[2] = { 0, 0 }; |
3893 | | int bestInterDir[2] = { 0, 0 }; |
3894 | | int bestRefIdx[2] = { 0, 0 }; |
3895 | | bool isMergeMode[2] = { false, false }; |
3896 | | bool isIBCMergeMode[2] = { false, false }; |
3897 | | MVField cMRGMvField[2][2]; |
3898 | | MVField cMRGMvFieldIBC[2][2]; |
3899 | | int log2ParallelMergeLevelMinus2 = 0; |
3900 | | // 12 mv candidates including lowresMV |
3901 | | MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; |
3902 | | |
3903 | | Yuv* predYuv = &intraBCMixedMode.predYuv; |
3904 | | Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; |
3905 | | |
3906 | | for (int combo = 0; combo < numComb; combo++) // number of combination |
3907 | | { |
3908 | | for (int partIdx = 0; partIdx < numPart; ++partIdx) |
3909 | | { |
3910 | | int dummyWidth, dummyHeight; |
3911 | | uint32_t partAddr = 0; |
3912 | | PredictionUnit pu(cu, cuGeom, partIdx); |
3913 | | cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth, dummyHeight); |
3914 | | m_me.setSourcePU(*intraBCMixedMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC); |
3915 | | |
3916 | | MV mvPred[2]; |
3917 | | MV bvPred[2]; |
3918 | | if ((combo == 0 && partIdx == 0) || (combo == 1 && partIdx == 1)) // intraBC |
3919 | | { |
3920 | | MV cMv = iMvCandList[8 + partIdx]; |
3921 | | if (cMv.x == 0 && cMv.y == 0) |
3922 | | { |
3923 | | cost[combo] = maxCost; |
3924 | | IBCValidFlag++; |
3925 | | break; |
3926 | | } |
3927 | | |
3928 | | cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours); |
3929 | | cu.getPMV(intraBCMixedMode.interNeighbours, 0, m_slice->m_numRefIdx[0] - 1, intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1], mvc, partIdx, pu.puAbsPartIdx); |
3930 | | |
3931 | | bvPred[0] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][0]; |
3932 | | bvPred[1] = intraBCMixedMode.amvpCand[0][m_slice->m_numRefIdx[0] - 1][1]; |
3933 | | bvPred[0] >>= 2; |
3934 | | bvPred[1] >>= 2; |
3935 | | |
3936 | | ///////////////////////////////////////////////////////////// |
3937 | | // ibc merge |
3938 | | // choose one MVP and compare with merge mode |
3939 | | |
3940 | | int bitsAMVPBest, bitsAMVPTemp, bitsMergeTemp; |
3941 | | int distAMVPBest, distMergeTemp; |
3942 | | int costAMVPBest, costMergeBest, costMergeTemp; |
3943 | | bitsAMVPBest = MAX_INT; |
3944 | | costAMVPBest = MAX_INT; |
3945 | | costMergeBest = MAX_INT; |
3946 | | int mvpIdxBest = 0; |
3947 | | int mvpIdxTemp; |
3948 | | int mrgIdxBest = -1; |
3949 | | int mrgIdxTemp = -1; |
3950 | | int xCUStart = cu.m_cuPelX; |
3951 | | int yCUStart = cu.m_cuPelY; |
3952 | | int xStartInCU = 0, yStartInCU = 0; |
3953 | | if (ePartSize == SIZE_2Nx2N) |
3954 | | xStartInCU = yStartInCU = 0; |
3955 | | else if (ePartSize == SIZE_2NxN) |
3956 | | { |
3957 | | xStartInCU = 0; |
3958 | | yStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx; |
3959 | | } |
3960 | | else if (ePartSize == SIZE_Nx2N) |
3961 | | { |
3962 | | xStartInCU = (1 << cu.m_log2CUSize[0]) / 2 * partIdx; |
3963 | | yStartInCU = 0; |
3964 | | } |
3965 | | const pixel* currStart; |
3966 | | int currStride; |
3967 | | int refStride; |
3968 | | distAMVPBest = 0; |
3969 | | pixel* ref; |
3970 | | |
3971 | | cu.setPUMv(0, cMv, pu.puAbsPartIdx, partIdx); |
3972 | | cu.setPURefIdx(0, (int8_t)m_slice->m_numRefIdx[0] - 1, pu.puAbsPartIdx, partIdx); |
3973 | | cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx); |
3974 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
3975 | | cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx); |
3976 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
3977 | | |
3978 | | for (uint32_t ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++) |
3979 | | { |
3980 | | int tempHeight, tempWidth; |
3981 | | if (ch == 0) |
3982 | | { |
3983 | | tempHeight = dummyHeight; |
3984 | | tempWidth = dummyWidth; |
3985 | | ref = tmpPredYuv.getLumaAddr(partAddr); |
3986 | | refStride = tmpPredYuv.m_size; |
3987 | | distAMVPBest += m_me.bufSAD(ref, refStride); |
3988 | | } |
3989 | | else |
3990 | | { |
3991 | | tempHeight = dummyHeight >> m_vChromaShift; |
3992 | | tempWidth = dummyWidth >> m_hChromaShift; |
3993 | | |
3994 | | currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr); |
3995 | | currStride = intraBCMixedMode.fencYuv->m_csize; |
3996 | | ref = tmpPredYuv.getChromaAddr(ch, partAddr); |
3997 | | refStride = tmpPredYuv.m_csize; |
3998 | | distAMVPBest += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight); |
3999 | | } |
4000 | | } |
4001 | | |
4002 | | MV check; |
4003 | | for (mvpIdxTemp = 0; mvpIdxTemp < AMVP_NUM_CANDS; mvpIdxTemp++) |
4004 | | { |
4005 | | m_me.setMVP(bvPred[mvpIdxTemp]); |
4006 | | bitsAMVPTemp = m_me.bitcost(cMv >> 2, bvPred[mvpIdxTemp]); |
4007 | | if (bitsAMVPTemp < bitsAMVPBest) |
4008 | | { |
4009 | | bitsAMVPBest = bitsAMVPTemp; |
4010 | | mvpIdxBest = mvpIdxTemp; |
4011 | | } |
4012 | | } |
4013 | | |
4014 | | bitsAMVPBest++; // for MVP Index bits |
4015 | | costAMVPBest = distAMVPBest + m_rdCost.getCost(bitsAMVPBest); |
4016 | | |
4017 | | MVField cMvFieldNeighboursIBC[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists |
4018 | | uint8_t uhInterDirNeighboursIBC[MRG_MAX_NUM_CANDS]; |
4019 | | int numValidMergeCandIBC = 0; |
4020 | | |
4021 | | if (ePartSize != SIZE_2Nx2N) |
4022 | | { |
4023 | | if (log2ParallelMergeLevelMinus2 && ePartSize != SIZE_2Nx2N && 1 << cu.m_log2CUSize[0] >= 8) |
4024 | | { |
4025 | | cu.setPartSizeSubParts(SIZE_2Nx2N); |
4026 | | if (partIdx == 0) |
4027 | | { |
4028 | | numValidMergeCandIBC = cu.getInterMergeCandidates(0, 0, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC); |
4029 | | } |
4030 | | cu.setPartSizeSubParts(ePartSize); |
4031 | | } |
4032 | | else |
4033 | | { |
4034 | | numValidMergeCandIBC = cu.getInterMergeCandidates(pu.puAbsPartIdx, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC); |
4035 | | } |
4036 | | |
4037 | | cu.roundMergeCandidates(cMvFieldNeighboursIBC, numValidMergeCandIBC); |
4038 | | restrictBipredMergeCand(&cu, partIdx, cMvFieldNeighboursIBC, uhInterDirNeighboursIBC, numValidMergeCandIBC); |
4039 | | |
4040 | | for (mrgIdxTemp = 0; mrgIdxTemp < numValidMergeCandIBC; mrgIdxTemp++) |
4041 | | { |
4042 | | if (uhInterDirNeighboursIBC[mrgIdxTemp] != 1) |
4043 | | { |
4044 | | continue; |
4045 | | } |
4046 | | if (m_slice->m_refPOCList[0][cMvFieldNeighboursIBC[mrgIdxTemp][0].refIdx] != m_slice->m_poc) |
4047 | | { |
4048 | | continue; |
4049 | | } |
4050 | | |
4051 | | if (!isBlockVectorValid(xCUStart + xStartInCU, yCUStart + yStartInCU, dummyWidth, dummyHeight, &cu, |
4052 | | xStartInCU, yStartInCU, (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.x >> 2), (cMvFieldNeighboursIBC[mrgIdxTemp][0].mv.y >> 2), m_param->maxCUSize)) |
4053 | | { |
4054 | | continue; |
4055 | | } |
4056 | | bitsMergeTemp = mrgIdxTemp == (int)m_param->maxNumMergeCand ? mrgIdxTemp : mrgIdxTemp + 1; |
4057 | | |
4058 | | distMergeTemp = 0; |
4059 | | cu.setPUMv(0, cMvFieldNeighboursIBC[mrgIdxTemp][0].mv, pu.puAbsPartIdx, partIdx); |
4060 | | cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx); |
4061 | | cu.setPUMv(1, MV(0, 0), pu.puAbsPartIdx, partIdx); |
4062 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4063 | | cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx); |
4064 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
4065 | | |
4066 | | for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++) |
4067 | | { |
4068 | | int tempHeight, tempWidth; |
4069 | | if (ch == 0) |
4070 | | { |
4071 | | tempHeight = dummyHeight; |
4072 | | tempWidth = dummyWidth; |
4073 | | ref = tmpPredYuv.getLumaAddr(partAddr); |
4074 | | refStride = tmpPredYuv.m_size; |
4075 | | distMergeTemp += m_me.bufSAD(ref, refStride); |
4076 | | } |
4077 | | else |
4078 | | { |
4079 | | tempHeight = dummyHeight >> m_vChromaShift; |
4080 | | tempWidth = dummyWidth >> m_hChromaShift; |
4081 | | |
4082 | | currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr); |
4083 | | currStride = intraBCMixedMode.fencYuv->m_csize; |
4084 | | ref = tmpPredYuv.getChromaAddr(ch, partAddr); |
4085 | | refStride = tmpPredYuv.m_csize; |
4086 | | distMergeTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight); |
4087 | | } |
4088 | | } |
4089 | | costMergeTemp = distMergeTemp + m_rdCost.getCost(bitsMergeTemp); |
4090 | | |
4091 | | if (costMergeTemp < costMergeBest) |
4092 | | { |
4093 | | costMergeBest = costMergeTemp; |
4094 | | mrgIdxBest = mrgIdxTemp; |
4095 | | } |
4096 | | } |
4097 | | } |
4098 | | |
4099 | | if (costMergeBest < costAMVPBest) |
4100 | | { |
4101 | | cost[combo] += costMergeBest; |
4102 | | isIBCMergeMode[combo] = true; |
4103 | | bestIBCMvpIdx[combo] = mrgIdxBest; |
4104 | | |
4105 | | MVField mvField[2]; |
4106 | | MV mv(cMvFieldNeighboursIBC[mrgIdxBest][0].mv.x, cMvFieldNeighboursIBC[mrgIdxBest][0].mv.y); |
4107 | | mvField[0].mv = mv; |
4108 | | mvField[0].refIdx = m_slice->m_numRefIdx[0] - 1; // the current picture is at the last position of list0 |
4109 | | mvField[1].mv = cMvZero; |
4110 | | mvField[1].refIdx = REF_NOT_VALID; |
4111 | | cMRGMvFieldIBC[combo][0] = mvField[0]; |
4112 | | cMRGMvFieldIBC[combo][1] = mvField[1]; |
4113 | | } |
4114 | | else |
4115 | | { |
4116 | | cost[combo] += costAMVPBest; |
4117 | | isIBCMergeMode[combo] = false; |
4118 | | bestIBCMvpIdx[combo] = mvpIdxBest; |
4119 | | cMvPredCand[combo][partIdx].set(bvPred[mvpIdxBest].x << 2, bvPred[mvpIdxBest].y << 2); |
4120 | | } |
4121 | | |
4122 | | cu.setPUInterDir(1, pu.puAbsPartIdx, partIdx); // list 0 prediction |
4123 | | if (isIBCMergeMode[combo]) |
4124 | | { |
4125 | | cu.setPUMv(0, cMRGMvFieldIBC[combo][0].mv, pu.puAbsPartIdx, partIdx); |
4126 | | } |
4127 | | else |
4128 | | { |
4129 | | cu.setPUMv(0, iMvCandList[8 + partIdx], pu.puAbsPartIdx, partIdx); |
4130 | | cu.setPURefIdx(0, (int8_t)(m_slice->m_numRefIdx[0] - 1), pu.puAbsPartIdx, partIdx); |
4131 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4132 | | } |
4133 | | // ibc merge |
4134 | | ///////////////////////////////////////////////////////////// |
4135 | | } |
4136 | | else // is inter PU |
4137 | | { |
4138 | | uint32_t costInterTemp = 0; |
4139 | | uint32_t costInterBest = UINT32_MAX; |
4140 | | const pixel* currStart; |
4141 | | int currStride; |
4142 | | pixel* ref; |
4143 | | int refStride; |
4144 | | MergeData merge; |
4145 | | memset(&merge, 0, sizeof(merge)); |
4146 | | for (int refList = 0; refList < numPredDir; refList++) |
4147 | | { |
4148 | | uint32_t numRef = refList ? ((m_slice->m_numRefIdx[1] > 1) ? 2 : 1) : ((m_slice->m_numRefIdx[0] - 1 > 1) ? 2 : 1); |
4149 | | for (uint32_t refIdx = 0; refIdx < numRef; refIdx++) |
4150 | | { |
4151 | | MV cMv = iMvCandList[4 * refList + 2 * refIdx + partIdx]; |
4152 | | |
4153 | | cu.getNeighbourMV(partIdx, pu.puAbsPartIdx, intraBCMixedMode.interNeighbours); |
4154 | | cu.getPMV(intraBCMixedMode.interNeighbours, refList, refIdx, intraBCMixedMode.amvpCand[refList][refIdx], mvc, partIdx, pu.puAbsPartIdx); |
4155 | | int mvpIdx; |
4156 | | |
4157 | | uint32_t tempCost0 = 0; |
4158 | | uint32_t tempCost1 = 0; |
4159 | | mvPred[0] = intraBCMixedMode.amvpCand[refList][refIdx][0]; |
4160 | | mvPred[1] = intraBCMixedMode.amvpCand[refList][refIdx][1]; |
4161 | | |
4162 | | m_me.setMVP(mvPred[0]); |
4163 | | tempCost0 = m_me.bitcost(cMv, mvPred[0]); |
4164 | | m_me.setMVP(mvPred[1]); |
4165 | | tempCost1 = m_me.bitcost(cMv, mvPred[1]); |
4166 | | if (tempCost1 < tempCost0) |
4167 | | { |
4168 | | mvpIdx = 1; |
4169 | | } |
4170 | | else |
4171 | | { |
4172 | | mvpIdx = 0; |
4173 | | } |
4174 | | uint32_t bitsTemp = m_listSelBits[refList] + MVP_IDX_BITS; |
4175 | | bitsTemp += getTUBits(refIdx, numRef); |
4176 | | |
4177 | | m_me.setMVP(mvPred[mvpIdx]); |
4178 | | if (cu.m_slice->m_useIntegerMv) |
4179 | | { |
4180 | | cu.setPUMv(refList, (cMv >> 2) << 2, pu.puAbsPartIdx, partIdx); |
4181 | | } |
4182 | | else |
4183 | | { |
4184 | | cu.setPUMv(refList, cMv, pu.puAbsPartIdx, partIdx); |
4185 | | } |
4186 | | cu.setPURefIdx(refList, refIdx, pu.puAbsPartIdx, partIdx); |
4187 | | cu.setPUInterDir(1 + refList, pu.puAbsPartIdx, partIdx); |
4188 | | motionCompensation(cu, pu, tmpPredYuv, 1, 1); |
4189 | | |
4190 | | costInterTemp = 0; |
4191 | | for (int ch = TEXT_LUMA; ch < MAX_NUM_COMPONENT; ch++) |
4192 | | { |
4193 | | int tempHeight, tempWidth; |
4194 | | if (ch == 0) |
4195 | | { |
4196 | | tempHeight = dummyHeight; |
4197 | | tempWidth = dummyWidth; |
4198 | | ref = tmpPredYuv.getLumaAddr(partAddr); |
4199 | | refStride = tmpPredYuv.m_size; |
4200 | | costInterTemp += m_me.bufSAD(ref, refStride); |
4201 | | } |
4202 | | else |
4203 | | { |
4204 | | tempHeight = dummyHeight >> m_vChromaShift; |
4205 | | tempWidth = dummyWidth >> m_hChromaShift; |
4206 | | |
4207 | | currStart = intraBCMixedMode.fencYuv->getChromaAddr(ch, partAddr); |
4208 | | currStride = intraBCMixedMode.fencYuv->m_csize; |
4209 | | ref = tmpPredYuv.getChromaAddr(ch, partAddr); |
4210 | | refStride = tmpPredYuv.m_csize; |
4211 | | costInterTemp += getSAD(ref, refStride, currStart, currStride, tempWidth, tempHeight); |
4212 | | } |
4213 | | |
4214 | | if (costInterTemp >= costInterBest) |
4215 | | { |
4216 | | break; |
4217 | | } |
4218 | | } |
4219 | | cu.setPURefIdx(refList, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4220 | | |
4221 | | costInterTemp += m_me.bitcost(cMv, mvPred[mvpIdx]); |
4222 | | costInterTemp += m_rdCost.getCost(bitsTemp); |
4223 | | |
4224 | | if (costInterTemp < costInterBest) |
4225 | | { |
4226 | | costInterBest = costInterTemp; |
4227 | | bestInterMvpIdx[combo] = mvpIdx; |
4228 | | bestInterDir[combo] = refList; |
4229 | | bestRefIdx[combo] = refIdx; |
4230 | | cMvPredCand[combo][partIdx] = mvPred[mvpIdx]; |
4231 | | } |
4232 | | } |
4233 | | } // end RefIdx and RefList search |
4234 | | |
4235 | | uint32_t MRGInterDir = 0; |
4236 | | uint32_t MRGIndex = 0; |
4237 | | |
4238 | | // find Merge result |
4239 | | uint32_t MRGCost = UINT32_MAX; |
4240 | | cu.m_mergeFlag[pu.puAbsPartIdx] = true; |
4241 | | |
4242 | | mergeEstimation(cu, cuGeom, pu, partIdx, merge); |
4243 | | MRGInterDir = merge.dir; |
4244 | | cMRGMvField[combo][0] = merge.mvField[0]; |
4245 | | cMRGMvField[combo][1] = merge.mvField[1]; |
4246 | | MRGIndex = merge.index; |
4247 | | cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4248 | | cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4249 | | |
4250 | | if (MRGCost < costInterBest) |
4251 | | { |
4252 | | costInterBest = MRGCost; |
4253 | | isMergeMode[combo] = true; |
4254 | | bestInterMvpIdx[combo] = MRGIndex; |
4255 | | bestInterDir[combo] = MRGInterDir; |
4256 | | } |
4257 | | |
4258 | | cost[combo] += costInterBest; |
4259 | | if (isMergeMode[combo]) |
4260 | | { |
4261 | | cu.setPUInterDir(bestInterDir[combo], pu.puAbsPartIdx, partIdx); |
4262 | | cu.setPUMv(0, cMRGMvField[combo][0].mv, pu.puAbsPartIdx, partIdx); |
4263 | | cu.setPURefIdx(0, cMRGMvField[combo][0].refIdx, pu.puAbsPartIdx, partIdx); |
4264 | | cu.setPUMv(1, cMRGMvField[combo][1].mv, pu.puAbsPartIdx, partIdx); |
4265 | | cu.setPURefIdx(1, cMRGMvField[combo][1].refIdx, pu.puAbsPartIdx, partIdx); |
4266 | | } |
4267 | | else |
4268 | | { |
4269 | | int refListOpt = bestInterDir[combo]; |
4270 | | int refIdxOpt = bestRefIdx[combo]; |
4271 | | if (cu.m_slice->m_useIntegerMv) |
4272 | | { |
4273 | | cu.setPUMv(refListOpt, (iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, pu.puAbsPartIdx, partIdx); |
4274 | | } |
4275 | | else |
4276 | | { |
4277 | | cu.setPUMv(refListOpt, iMvCandList[partIdx + 2 * refIdxOpt + 4 * refListOpt], pu.puAbsPartIdx, partIdx); |
4278 | | } |
4279 | | cu.setPURefIdx(refListOpt, refIdxOpt, pu.puAbsPartIdx, partIdx); |
4280 | | cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, pu.puAbsPartIdx, partIdx); |
4281 | | cu.setPUInterDir(1 + refListOpt, pu.puAbsPartIdx, partIdx); |
4282 | | cu.m_mvpIdx[refListOpt][pu.puAbsPartIdx] = bestInterMvpIdx[combo]; |
4283 | | } |
4284 | | } |
4285 | | } // for ipartIdx |
4286 | | } // for combo |
4287 | | |
4288 | | if (IBCValidFlag > 1) |
4289 | | { |
4290 | | return false; |
4291 | | } |
4292 | | |
4293 | | MV cMvd; |
4294 | | MV cMVFinal; |
4295 | | if (cost[0] <= cost[1]) |
4296 | | { |
4297 | | int iDummyWidth1, iDummyHeight1; |
4298 | | uint32_t partAddr = 0; |
4299 | | uint32_t partIdx = 0; |
4300 | | cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1); |
4301 | | |
4302 | | if (isIBCMergeMode[0]) |
4303 | | { |
4304 | | cu.m_mergeFlag[partAddr] = true; |
4305 | | cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0]; |
4306 | | cu.setPUInterDir(1, partAddr, partIdx); // list 0 prediction |
4307 | | cu.setPUMv(0, cMRGMvFieldIBC[0][0].mv, partAddr, partIdx); |
4308 | | cu.setPURefIdx(0, cMRGMvFieldIBC[0][0].refIdx, partAddr, partIdx); |
4309 | | cu.setPUMv(1, cMRGMvFieldIBC[0][1].mv, partAddr, partIdx); |
4310 | | cu.setPURefIdx(1, cMRGMvFieldIBC[0][1].refIdx, partAddr, partIdx); |
4311 | | |
4312 | | cu.m_mvd[0][partAddr] = cMvZero; |
4313 | | cu.m_mvd[1][partAddr] = cMvZero; |
4314 | | } |
4315 | | else |
4316 | | { |
4317 | | cu.m_mergeFlag[partAddr] = false; |
4318 | | |
4319 | | cMvd.set((iMvCandList[8].x - cMvPredCand[0][0].x) >> 2, (iMvCandList[8].y - cMvPredCand[0][0].y) >> 2); |
4320 | | cu.setPUMv(0, iMvCandList[8], partAddr, partIdx); |
4321 | | cu.m_mvd[0][partAddr] = cMvd; |
4322 | | cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[0]; |
4323 | | cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx); |
4324 | | cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx); |
4325 | | cu.setPUInterDir(1, partAddr, partIdx); // list 0 prediction |
4326 | | } |
4327 | | |
4328 | | partIdx = 1; |
4329 | | cu.getPartIndexAndSize(partIdx, partAddr, iDummyWidth1, iDummyHeight1); |
4330 | | |
4331 | | if (isMergeMode[0]) |
4332 | | { |
4333 | | cu.m_mergeFlag[partAddr] = true; |
4334 | | cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[0]; |
4335 | | cu.setPUInterDir(bestInterDir[0], partAddr, partIdx); // list 0 prediction |
4336 | | cu.setPUMv(0, cMRGMvField[0][0].mv, partAddr, partIdx); |
4337 | | cu.setPURefIdx(0, cMRGMvField[0][0].refIdx, partAddr, partIdx); |
4338 | | cu.setPUMv(1, cMRGMvField[0][1].mv, partAddr, partIdx); |
4339 | | cu.setPURefIdx(1, cMRGMvField[0][1].refIdx, partAddr, partIdx); |
4340 | | |
4341 | | cu.m_mvd[0][partAddr] = cMvZero; |
4342 | | cu.m_mvd[1][partAddr] = cMvZero; |
4343 | | } |
4344 | | else |
4345 | | { |
4346 | | int refListOpt = bestInterDir[0]; |
4347 | | int refIdxOpt = bestRefIdx[0]; |
4348 | | if (cu.m_slice->m_useIntegerMv) |
4349 | | { |
4350 | | cMvd.set(((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[0][1].x >> 2)), ((iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[0][1].y >> 2))); |
4351 | | cu.setPUMv(refListOpt, (iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx); |
4352 | | } |
4353 | | else |
4354 | | { |
4355 | | cMvd.set(iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[0][1].x, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[0][1].y); |
4356 | | cu.setPUMv(refListOpt, iMvCandList[1 + 2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx); |
4357 | | } |
4358 | | cu.m_mvd[refListOpt][partAddr] = cMvd; |
4359 | | cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx); |
4360 | | cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx); |
4361 | | cu.setPUInterDir(1 + refListOpt, partAddr, partIdx); |
4362 | | cu.m_mergeFlag[partAddr] = false; |
4363 | | cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[0]; |
4364 | | } |
4365 | | } |
4366 | | else |
4367 | | { |
4368 | | int dummyWidth2, dummyHeight2; |
4369 | | uint32_t partAddr = 0; |
4370 | | uint32_t partIdx = 0; |
4371 | | |
4372 | | cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2); |
4373 | | |
4374 | | if (isMergeMode[1]) |
4375 | | { |
4376 | | cu.m_mergeFlag[partAddr] = true; |
4377 | | cu.m_mvpIdx[0][partAddr] = bestInterMvpIdx[1]; |
4378 | | cu.setPUInterDir(bestInterDir[1], partAddr, partIdx); // list 0 prediction |
4379 | | cu.setPUMv(0, cMRGMvField[1][0].mv, partAddr, partIdx); |
4380 | | cu.setPURefIdx(0, cMRGMvField[1][0].refIdx, partAddr, partIdx); |
4381 | | cu.setPUMv(1, cMRGMvField[1][1].mv, partAddr, partIdx); |
4382 | | cu.setPURefIdx(1, cMRGMvField[1][1].refIdx, partAddr, partIdx); |
4383 | | |
4384 | | cu.m_mvd[0][partAddr] = cMvZero; |
4385 | | cu.m_mvd[1][partAddr] = cMvZero; |
4386 | | } |
4387 | | else |
4388 | | { |
4389 | | int refListOpt = bestInterDir[1]; |
4390 | | int refIdxOpt = bestRefIdx[1]; |
4391 | | if (cu.m_slice->m_useIntegerMv) |
4392 | | { |
4393 | | cMvd.set((iMvCandList[2 * refIdxOpt + 4 * refListOpt].x >> 2) - (cMvPredCand[1][0].x >> 2), (iMvCandList[2 * refIdxOpt + 4 * refListOpt].y >> 2) - (cMvPredCand[1][0].y >> 2)); |
4394 | | cu.setPUMv(refListOpt, (iMvCandList[2 * refIdxOpt + 4 * refListOpt] >> 2) << 2, partAddr, partIdx); |
4395 | | } |
4396 | | else |
4397 | | { |
4398 | | cMvd.set(iMvCandList[2 * refIdxOpt + 4 * refListOpt].x - cMvPredCand[1][0].x, iMvCandList[2 * refIdxOpt + 4 * refListOpt].y - cMvPredCand[1][0].y); |
4399 | | cu.setPUMv(refListOpt, iMvCandList[2 * refIdxOpt + 4 * refListOpt], partAddr, partIdx); |
4400 | | } |
4401 | | cu.m_mvd[refListOpt][partAddr] = cMvd; |
4402 | | cu.setPURefIdx(refListOpt, refIdxOpt, partAddr, partIdx); |
4403 | | cu.setPURefIdx(1 - refListOpt, REF_NOT_VALID, partAddr, partIdx); |
4404 | | cu.setPUInterDir(1 + refListOpt, partAddr, partIdx); |
4405 | | cu.m_mergeFlag[partAddr] = false; |
4406 | | cu.m_mvpIdx[refListOpt][partAddr] = bestInterMvpIdx[1]; |
4407 | | } |
4408 | | |
4409 | | partIdx = 1; |
4410 | | cu.getPartIndexAndSize(partIdx, partAddr, dummyWidth2, dummyHeight2); |
4411 | | |
4412 | | if (isIBCMergeMode[1]) |
4413 | | { |
4414 | | cu.m_mergeFlag[partAddr] = true; |
4415 | | cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1]; |
4416 | | cu.setPUInterDir(1, partAddr, partIdx); // list 0 prediction |
4417 | | cu.setPUMv(0, cMRGMvFieldIBC[1][0].mv, partAddr, partIdx); |
4418 | | cu.setPURefIdx(0, cMRGMvFieldIBC[1][0].refIdx, partAddr, partIdx); |
4419 | | cu.setPUMv(1, cMRGMvFieldIBC[1][1].mv, partAddr, partIdx); |
4420 | | cu.setPURefIdx(1, cMRGMvFieldIBC[1][1].refIdx, partAddr, partIdx); |
4421 | | |
4422 | | cu.m_mvd[0][partAddr] = cMvZero; |
4423 | | cu.m_mvd[1][partAddr] = cMvZero; |
4424 | | } |
4425 | | else |
4426 | | { |
4427 | | cu.m_mergeFlag[partAddr] = false; |
4428 | | |
4429 | | cMvd.set(((iMvCandList[9].x - cMvPredCand[1][1].x) >> 2), (iMvCandList[9].y - cMvPredCand[1][1].y) >> 2); |
4430 | | cu.setPUMv(0, iMvCandList[9], partAddr, partIdx); |
4431 | | cu.m_mvd[0][partAddr] = cMvd; |
4432 | | cu.m_mvpIdx[0][partAddr] = bestIBCMvpIdx[1]; |
4433 | | cu.setPURefIdx(0, m_slice->m_numRefIdx[0] - 1, partAddr, partIdx); |
4434 | | cu.setPURefIdx(1, REF_NOT_VALID, partAddr, partIdx); |
4435 | | cu.setPUInterDir(1, partAddr, partIdx); // list 0 prediction |
4436 | | } |
4437 | | } |
4438 | | for (int partIdx = 0; partIdx < numPart; ++partIdx) |
4439 | | { |
4440 | | PredictionUnit pu(cu, cuGeom, partIdx); |
4441 | | motionCompensation(cu, pu, *predYuv, 1, 1); |
4442 | | } |
4443 | | |
4444 | | return true; |
4445 | | } |
4446 | | #endif |
4447 | | |
4448 | | void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]) |
4449 | 0 | { |
4450 | 0 | if (cuMode == SIZE_2Nx2N) |
4451 | 0 | { |
4452 | 0 | blockBit[0] = (!bPSlice) ? 3 : 1; |
4453 | 0 | blockBit[1] = 3; |
4454 | 0 | blockBit[2] = 5; |
4455 | 0 | } |
4456 | 0 | else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD) |
4457 | 0 | { |
4458 | 0 | static const uint32_t listBits[2][3][3] = |
4459 | 0 | { |
4460 | 0 | { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, |
4461 | 0 | { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } } |
4462 | 0 | }; |
4463 | 0 | if (bPSlice) |
4464 | 0 | { |
4465 | 0 | blockBit[0] = 3; |
4466 | 0 | blockBit[1] = 0; |
4467 | 0 | blockBit[2] = 0; |
4468 | 0 | } |
4469 | 0 | else |
4470 | 0 | memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); |
4471 | 0 | } |
4472 | 0 | else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N) |
4473 | 0 | { |
4474 | 0 | static const uint32_t listBits[2][3][3] = |
4475 | 0 | { |
4476 | 0 | { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } }, |
4477 | 0 | { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } } |
4478 | 0 | }; |
4479 | 0 | if (bPSlice) |
4480 | 0 | { |
4481 | 0 | blockBit[0] = 3; |
4482 | 0 | blockBit[1] = 0; |
4483 | 0 | blockBit[2] = 0; |
4484 | 0 | } |
4485 | 0 | else |
4486 | 0 | memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t)); |
4487 | 0 | } |
4488 | 0 | else if (cuMode == SIZE_NxN) |
4489 | 0 | { |
4490 | 0 | blockBit[0] = (!bPSlice) ? 3 : 1; |
4491 | 0 | blockBit[1] = 3; |
4492 | 0 | blockBit[2] = 5; |
4493 | 0 | } |
4494 | 0 | else |
4495 | 0 | { |
4496 | 0 | X265_CHECK(0, "getBlkBits: unknown cuMode\n"); |
4497 | 0 | } |
4498 | 0 | } |
4499 | | |
4500 | | /* Check if using an alternative MVP would result in a smaller MVD + signal bits */ |
4501 | | const MV& Search::checkBestMVP(const MV* amvpCand, const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const |
4502 | 0 | { |
4503 | 0 | int diffBits = m_me.bitcost(mv, amvpCand[!mvpIdx]) - m_me.bitcost(mv, amvpCand[mvpIdx]); |
4504 | 0 | if (diffBits < 0) |
4505 | 0 | { |
4506 | 0 | mvpIdx = !mvpIdx; |
4507 | 0 | uint32_t origOutBits = outBits; |
4508 | 0 | outBits = origOutBits + diffBits; |
4509 | 0 | outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits); |
4510 | 0 | } |
4511 | 0 | return amvpCand[mvpIdx]; |
4512 | 0 | } |
4513 | | |
4514 | | /* Update to default MVP when using an alternative mvp */ |
4515 | | void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP) |
4516 | 0 | { |
4517 | 0 | int diffBits = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP); |
4518 | 0 | uint32_t origOutBits = outBits; |
4519 | 0 | outBits = origOutBits + diffBits; |
4520 | 0 | outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits); |
4521 | 0 | } |
4522 | | |
4523 | | void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const |
4524 | 0 | { |
4525 | 0 | MV dist((int32_t)merange << 2, (int32_t)merange << 2); |
4526 | 0 | mvmin = mvp - dist; |
4527 | 0 | mvmax = mvp + dist; |
4528 | |
|
4529 | 0 | if (m_vertRestriction) |
4530 | 0 | { |
4531 | 0 | int mvRestricted = (56 - 1) << 2; // -1 to consider subpel search |
4532 | 0 | if (mvmax.y >= mvRestricted) |
4533 | 0 | { |
4534 | 0 | mvmax.y = mvRestricted; //only positive side is restricted |
4535 | 0 | } |
4536 | 0 | } |
4537 | |
|
4538 | 0 | cu.clipMv(mvmin); |
4539 | 0 | cu.clipMv(mvmax); |
4540 | |
|
4541 | 0 | if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && |
4542 | 0 | cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol && |
4543 | 0 | m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth) |
4544 | 0 | { |
4545 | 0 | int safeX, maxSafeMv; |
4546 | 0 | safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; |
4547 | 0 | maxSafeMv = (safeX - cu.m_cuPelX) * 4; |
4548 | 0 | mvmax.x = X265_MIN(mvmax.x, maxSafeMv); |
4549 | 0 | mvmin.x = X265_MIN(mvmin.x, maxSafeMv); |
4550 | 0 | } |
4551 | | |
4552 | | // apply restrict on slices |
4553 | 0 | if ((m_param->maxSlices > 1) & m_bFrameParallel) |
4554 | 0 | { |
4555 | 0 | mvmin.y = X265_MAX(mvmin.y, m_sliceMinY); |
4556 | 0 | mvmax.y = X265_MIN(mvmax.y, m_sliceMaxY); |
4557 | 0 | } |
4558 | | |
4559 | | /* Clip search range to signaled maximum MV length. |
4560 | | * We do not support this VUI field being changed from the default */ |
4561 | 0 | const int maxMvLen = (1 << 15) - 1; |
4562 | 0 | mvmin.x = X265_MAX(mvmin.x, -maxMvLen); |
4563 | 0 | mvmin.y = X265_MAX(mvmin.y, -maxMvLen); |
4564 | 0 | mvmax.x = X265_MIN(mvmax.x, maxMvLen); |
4565 | 0 | mvmax.y = X265_MIN(mvmax.y, maxMvLen); |
4566 | |
|
4567 | 0 | mvmin >>= 2; |
4568 | 0 | mvmax >>= 2; |
4569 | | |
4570 | | /* conditional clipping for frame parallelism */ |
4571 | 0 | mvmin.y = X265_MIN(mvmin.y, (int32_t)m_refLagPixels); |
4572 | 0 | mvmax.y = X265_MIN(mvmax.y, (int32_t)m_refLagPixels); |
4573 | | |
4574 | | /* conditional clipping for negative mv range */ |
4575 | 0 | mvmax.y = X265_MAX(mvmax.y, mvmin.y); |
4576 | 0 | } |
4577 | | |
4578 | | /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ |
4579 | | void Search::encodeResAndCalcRdSkipCU(Mode& interMode) |
4580 | 0 | { |
4581 | 0 | CUData& cu = interMode.cu; |
4582 | 0 | Yuv* reconYuv = &interMode.reconYuv; |
4583 | 0 | const Yuv* fencYuv = interMode.fencYuv; |
4584 | 0 | Yuv* predYuv = &interMode.predYuv; |
4585 | 0 | X265_CHECK(!cu.isIntra(0), "intra CU not expected\n"); |
4586 | 0 | uint32_t depth = cu.m_cuDepth[0]; |
4587 | | |
4588 | | // No residual coding : SKIP mode |
4589 | |
|
4590 | 0 | cu.setPredModeSubParts(MODE_SKIP); |
4591 | 0 | cu.clearCbf(); |
4592 | 0 | cu.setTUDepthSubParts(0, 0, depth); |
4593 | |
|
4594 | 0 | reconYuv->copyFromYuv(interMode.predYuv); |
4595 | | |
4596 | | // Luma |
4597 | 0 | int part = partitionFromLog2Size(cu.m_log2CUSize[0]); |
4598 | 0 | interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
4599 | 0 | interMode.distortion = interMode.lumaDistortion; |
4600 | | // Chroma |
4601 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4602 | 0 | { |
4603 | 0 | interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); |
4604 | 0 | interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); |
4605 | 0 | interMode.distortion += interMode.chromaDistortion; |
4606 | 0 | } |
4607 | 0 | cu.m_distortion[0] = interMode.distortion; |
4608 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
4609 | 0 | m_entropyCoder.resetBits(); |
4610 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
4611 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); |
4612 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
4613 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
4614 | 0 | m_entropyCoder.codeMergeIndex(cu, 0); |
4615 | 0 | interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
4616 | 0 | interMode.coeffBits = 0; |
4617 | 0 | interMode.totalBits = interMode.mvBits + skipFlagBits; |
4618 | 0 | if (m_rdCost.m_psyRd) |
4619 | 0 | interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
4620 | 0 | else if(m_rdCost.m_ssimRd) |
4621 | 0 | interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0); |
4622 | |
|
4623 | 0 | interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
4624 | 0 | updateModeCost(interMode); |
4625 | 0 | m_entropyCoder.store(interMode.contexts); |
4626 | 0 | } |
4627 | | |
4628 | | /* encode residual and calculate rate-distortion for a CU block. |
4629 | | * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ |
4630 | | void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) |
4631 | 0 | { |
4632 | 0 | ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); |
4633 | |
|
4634 | 0 | CUData& cu = interMode.cu; |
4635 | 0 | Yuv* reconYuv = &interMode.reconYuv; |
4636 | 0 | Yuv* predYuv = &interMode.predYuv; |
4637 | 0 | uint32_t depth = cuGeom.depth; |
4638 | 0 | ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv; |
4639 | 0 | const Yuv* fencYuv = interMode.fencYuv; |
4640 | |
|
4641 | 0 | X265_CHECK(!cu.isIntra(0), "intra CU not expected\n"); |
4642 | |
|
4643 | 0 | uint32_t log2CUSize = cuGeom.log2CUSize; |
4644 | 0 | int sizeIdx = log2CUSize - 2; |
4645 | |
|
4646 | 0 | resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp); |
4647 | |
|
4648 | 0 | uint32_t tuDepthRange[2]; |
4649 | 0 | cu.getInterTUQtDepthRange(tuDepthRange, 0); |
4650 | |
|
4651 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
4652 | |
|
4653 | 0 | if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH)) |
4654 | 0 | m_maxTUDepth = -1; |
4655 | 0 | else if (m_limitTU & X265_TU_LIMIT_BFS) |
4656 | 0 | memset(&m_cacheTU, 0, sizeof(TUInfoCache)); |
4657 | |
|
4658 | 0 | Cost costs; |
4659 | 0 | if (m_limitTU & X265_TU_LIMIT_NEIGH) |
4660 | 0 | { |
4661 | | /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */ |
4662 | 0 | int32_t tempDepth = m_maxTUDepth; |
4663 | 0 | if (m_maxTUDepth != -1) |
4664 | 0 | { |
4665 | 0 | uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N; |
4666 | 0 | uint32_t minSize = tuDepthRange[0]; |
4667 | 0 | uint32_t maxSize = tuDepthRange[1]; |
4668 | 0 | maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag); |
4669 | 0 | m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth); |
4670 | 0 | } |
4671 | 0 | estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange); |
4672 | 0 | m_maxTUDepth = tempDepth; |
4673 | 0 | } |
4674 | 0 | else |
4675 | 0 | estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange); |
4676 | |
|
4677 | 0 | uint32_t tqBypass = cu.m_tqBypass[0]; |
4678 | 0 | if (!tqBypass) |
4679 | 0 | { |
4680 | 0 | sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
4681 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4682 | 0 | { |
4683 | 0 | cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); |
4684 | 0 | cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); |
4685 | 0 | } |
4686 | | |
4687 | | /* Consider the RD cost of not signaling any residual */ |
4688 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
4689 | 0 | m_entropyCoder.resetBits(); |
4690 | 0 | m_entropyCoder.codeQtRootCbfZero(); |
4691 | 0 | uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits(); |
4692 | |
|
4693 | 0 | uint32_t cbf0Energy; uint64_t cbf0Cost; |
4694 | 0 | if (m_rdCost.m_psyRd) |
4695 | 0 | { |
4696 | 0 | cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
4697 | 0 | cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy); |
4698 | 0 | } |
4699 | 0 | else if(m_rdCost.m_ssimRd) |
4700 | 0 | { |
4701 | 0 | cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0); |
4702 | 0 | cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy); |
4703 | 0 | } |
4704 | 0 | else |
4705 | 0 | cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits); |
4706 | |
|
4707 | 0 | if (cbf0Cost < costs.rdcost) |
4708 | 0 | { |
4709 | 0 | cu.clearCbf(); |
4710 | 0 | cu.setTUDepthSubParts(0, 0, depth); |
4711 | 0 | } |
4712 | 0 | } |
4713 | |
|
4714 | 0 | if (cu.getQtRootCbf(0)) |
4715 | 0 | saveResidualQTData(cu, *resiYuv, 0, 0); |
4716 | | |
4717 | | /* calculate signal bits for inter/merge/skip coded CU */ |
4718 | 0 | m_entropyCoder.load(m_rqt[depth].cur); |
4719 | |
|
4720 | 0 | m_entropyCoder.resetBits(); |
4721 | 0 | if (m_slice->m_pps->bTransquantBypassEnabled) |
4722 | 0 | m_entropyCoder.codeCUTransquantBypassFlag(tqBypass); |
4723 | |
|
4724 | 0 | uint32_t coeffBits, bits, mvBits; |
4725 | 0 | if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) |
4726 | 0 | { |
4727 | 0 | cu.setPredModeSubParts(MODE_SKIP); |
4728 | | |
4729 | | /* Merge/Skip */ |
4730 | 0 | coeffBits = mvBits = 0; |
4731 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
4732 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
4733 | 0 | m_entropyCoder.codeMergeIndex(cu, 0); |
4734 | 0 | mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
4735 | 0 | bits = mvBits + skipFlagBits; |
4736 | 0 | } |
4737 | 0 | else |
4738 | 0 | { |
4739 | 0 | m_entropyCoder.codeSkipFlag(cu, 0); |
4740 | 0 | int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits(); |
4741 | 0 | m_entropyCoder.codePredMode(cu.m_predMode[0]); |
4742 | 0 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); |
4743 | 0 | m_entropyCoder.codePredInfo(cu, 0); |
4744 | 0 | mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits; |
4745 | |
|
4746 | 0 | bool bCodeDQP = m_slice->m_pps->bUseDQP; |
4747 | 0 | m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); |
4748 | 0 | bits = m_entropyCoder.getNumberOfWrittenBits(); |
4749 | |
|
4750 | 0 | coeffBits = bits - mvBits - skipFlagBits; |
4751 | 0 | } |
4752 | |
|
4753 | 0 | m_entropyCoder.store(interMode.contexts); |
4754 | |
|
4755 | 0 | if (cu.getQtRootCbf(0)) |
4756 | 0 | reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp); |
4757 | 0 | else |
4758 | 0 | reconYuv->copyFromYuv(*predYuv); |
4759 | | |
4760 | | // update with clipped distortion and cost (qp estimation loop uses unclipped values) |
4761 | 0 | sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
4762 | 0 | interMode.distortion = bestLumaDist; |
4763 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4764 | 0 | { |
4765 | 0 | sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); |
4766 | 0 | bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); |
4767 | 0 | interMode.chromaDistortion = bestChromaDist; |
4768 | 0 | interMode.distortion += bestChromaDist; |
4769 | 0 | } |
4770 | 0 | if (m_rdCost.m_psyRd) |
4771 | 0 | interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); |
4772 | 0 | else if(m_rdCost.m_ssimRd) |
4773 | 0 | interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0); |
4774 | |
|
4775 | 0 | interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); |
4776 | 0 | interMode.totalBits = bits; |
4777 | 0 | interMode.lumaDistortion = bestLumaDist; |
4778 | 0 | interMode.coeffBits = coeffBits; |
4779 | 0 | interMode.mvBits = mvBits; |
4780 | 0 | cu.m_distortion[0] = interMode.distortion; |
4781 | 0 | updateModeCost(interMode); |
4782 | 0 | checkDQP(interMode, cuGeom); |
4783 | |
|
4784 | | #if ENABLE_SCC_EXT |
4785 | | if (m_param->bEnableSCC) |
4786 | | interMode.reconYuv.copyToPicYuv(*m_frame->m_reconPic[1], cu.m_cuAddr, cuGeom.absPartIdx); |
4787 | | #endif |
4788 | 0 | } |
4789 | | |
4790 | | void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]) |
4791 | 0 | { |
4792 | 0 | uint32_t depth = cuGeom.depth + tuDepth; |
4793 | 0 | CUData& cu = mode.cu; |
4794 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
4795 | |
|
4796 | 0 | bool bCheckFull = log2TrSize <= depthRange[1]; |
4797 | 0 | if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && log2TrSize > depthRange[0]) |
4798 | 0 | bCheckFull = false; |
4799 | |
|
4800 | 0 | if (bCheckFull) |
4801 | 0 | { |
4802 | | // code full block |
4803 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
4804 | 0 | uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0; |
4805 | |
|
4806 | 0 | uint32_t tuDepthC = tuDepth; |
4807 | 0 | if (log2TrSizeC < 2) |
4808 | 0 | { |
4809 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
4810 | 0 | log2TrSizeC = 2; |
4811 | 0 | tuDepthC--; |
4812 | 0 | codeChroma &= !(absPartIdx & 3); |
4813 | 0 | } |
4814 | |
|
4815 | 0 | uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2; |
4816 | 0 | uint32_t setCbf = 1 << tuDepth; |
4817 | |
|
4818 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
4819 | 0 | coeff_t* coeffCurY = cu.m_trCoeff[0] + coeffOffsetY; |
4820 | |
|
4821 | 0 | uint32_t sizeIdx = log2TrSize - 2; |
4822 | |
|
4823 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); |
4824 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); |
4825 | |
|
4826 | 0 | ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; |
4827 | 0 | const Yuv* fencYuv = mode.fencYuv; |
4828 | |
|
4829 | 0 | int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx); |
4830 | 0 | uint32_t strideResiY = resiYuv.m_size; |
4831 | |
|
4832 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
4833 | 0 | uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); |
4834 | |
|
4835 | 0 | if (numSigY) |
4836 | 0 | { |
4837 | 0 | m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY); |
4838 | 0 | cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth); |
4839 | 0 | } |
4840 | 0 | else |
4841 | 0 | { |
4842 | 0 | primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0); |
4843 | 0 | cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth); |
4844 | 0 | } |
4845 | |
|
4846 | 0 | if (codeChroma) |
4847 | 0 | { |
4848 | 0 | uint32_t sizeIdxC = log2TrSizeC - 2; |
4849 | 0 | uint32_t strideResiC = resiYuv.m_csize; |
4850 | |
|
4851 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
4852 | 0 | coeff_t* coeffCurU = cu.m_trCoeff[1] + coeffOffsetC; |
4853 | 0 | coeff_t* coeffCurV = cu.m_trCoeff[2] + coeffOffsetC; |
4854 | 0 | bool splitIntoSubTUs = (m_csp == X265_CSP_I422); |
4855 | |
|
4856 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
4857 | 0 | do |
4858 | 0 | { |
4859 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
4860 | 0 | uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); |
4861 | |
|
4862 | 0 | cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); |
4863 | 0 | cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); |
4864 | |
|
4865 | 0 | int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC); |
4866 | 0 | const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC); |
4867 | 0 | uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false); |
4868 | 0 | if (numSigU) |
4869 | 0 | { |
4870 | 0 | m_quant.invtransformNxN(cu, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU); |
4871 | 0 | cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); |
4872 | 0 | } |
4873 | 0 | else |
4874 | 0 | { |
4875 | 0 | primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0); |
4876 | 0 | cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep); |
4877 | 0 | } |
4878 | |
|
4879 | 0 | int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC); |
4880 | 0 | const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC); |
4881 | 0 | uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false); |
4882 | 0 | if (numSigV) |
4883 | 0 | { |
4884 | 0 | m_quant.invtransformNxN(cu, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV); |
4885 | 0 | cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); |
4886 | 0 | } |
4887 | 0 | else |
4888 | 0 | { |
4889 | 0 | primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0); |
4890 | 0 | cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); |
4891 | 0 | } |
4892 | 0 | } |
4893 | 0 | while (tuIterator.isNextSection()); |
4894 | |
|
4895 | 0 | if (splitIntoSubTUs) |
4896 | 0 | { |
4897 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
4898 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
4899 | 0 | } |
4900 | 0 | } |
4901 | 0 | } |
4902 | 0 | else |
4903 | 0 | { |
4904 | 0 | X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n"); |
4905 | |
|
4906 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
4907 | 0 | uint32_t ycbf = 0, ucbf = 0, vcbf = 0; |
4908 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
4909 | 0 | { |
4910 | 0 | residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange); |
4911 | 0 | ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); |
4912 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4913 | 0 | { |
4914 | 0 | ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); |
4915 | 0 | vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); |
4916 | 0 | } |
4917 | 0 | } |
4918 | 0 | cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth; |
4919 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4920 | 0 | { |
4921 | 0 | cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth; |
4922 | 0 | cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth; |
4923 | 0 | } |
4924 | 0 | } |
4925 | 0 | } |
4926 | | |
4927 | | uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t energy, uint32_t tuDepth, TextType compId) |
4928 | 0 | { |
4929 | 0 | uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth); |
4930 | |
|
4931 | 0 | if (m_rdCost.m_psyRd) |
4932 | 0 | return m_rdCost.calcPsyRdCost(dist, nullBits, energy); |
4933 | 0 | else if(m_rdCost.m_ssimRd) |
4934 | 0 | return m_rdCost.calcSsimRdCost(dist, nullBits, energy); |
4935 | 0 | else |
4936 | 0 | return m_rdCost.calcRdCost(dist, nullBits); |
4937 | 0 | } |
4938 | | |
4939 | | bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore) |
4940 | 0 | { |
4941 | 0 | CUData& cu = mode.cu; |
4942 | 0 | uint32_t depth = cuGeom.depth + tuDepth; |
4943 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
4944 | |
|
4945 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
4946 | 0 | uint32_t ycbf = 0, ucbf = 0, vcbf = 0; |
4947 | 0 | for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) |
4948 | 0 | { |
4949 | 0 | if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1) |
4950 | 0 | { |
4951 | 0 | m_maxTUDepth = cu.m_tuDepth[0]; |
4952 | | // Fetch maximum TU depth of first sub partition to limit recursion of others |
4953 | 0 | for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++) |
4954 | 0 | m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]); |
4955 | 0 | } |
4956 | 0 | estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore); |
4957 | 0 | ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); |
4958 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4959 | 0 | { |
4960 | 0 | ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); |
4961 | 0 | vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); |
4962 | 0 | } |
4963 | 0 | } |
4964 | 0 | cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth; |
4965 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
4966 | 0 | { |
4967 | 0 | cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth; |
4968 | 0 | cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth; |
4969 | 0 | } |
4970 | | |
4971 | | // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits |
4972 | | // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma. |
4973 | | // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context |
4974 | | // at depth 0 (for example). |
4975 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
4976 | 0 | m_entropyCoder.resetBits(); |
4977 | 0 | codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange); |
4978 | 0 | uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits(); |
4979 | 0 | splitCost.bits += splitCbfBits; |
4980 | |
|
4981 | 0 | if (m_rdCost.m_psyRd) |
4982 | 0 | splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); |
4983 | 0 | else if(m_rdCost.m_ssimRd) |
4984 | 0 | splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); |
4985 | 0 | else |
4986 | 0 | splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits); |
4987 | | |
4988 | 0 | return ycbf || ucbf || vcbf; |
4989 | 0 | } |
4990 | | |
4991 | | void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore) |
4992 | 0 | { |
4993 | 0 | CUData& cu = mode.cu; |
4994 | 0 | uint32_t depth = cuGeom.depth + tuDepth; |
4995 | 0 | uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth; |
4996 | 0 | bool bEnableRDOQ = !!m_param->rdoqLevel; |
4997 | |
|
4998 | 0 | bool bCheckSplit = log2TrSize > depthRange[0]; |
4999 | 0 | bool bCheckFull = log2TrSize <= depthRange[1]; |
5000 | 0 | bool bSaveTUData = false, bLoadTUData = false; |
5001 | 0 | uint32_t idx = 0; |
5002 | |
|
5003 | 0 | if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0) |
5004 | 0 | { |
5005 | 0 | if (bCheckSplit && bCheckFull && tuDepth) |
5006 | 0 | { |
5007 | 0 | uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; |
5008 | 0 | uint32_t qIdx = (absPartIdx / qNumParts) % 4; |
5009 | 0 | idx = (depth - 1) * 4 + qIdx; |
5010 | 0 | if (splitMore) |
5011 | 0 | { |
5012 | 0 | bLoadTUData = true; |
5013 | 0 | bCheckFull = false; |
5014 | 0 | } |
5015 | 0 | else |
5016 | 0 | { |
5017 | 0 | bSaveTUData = true; |
5018 | 0 | bCheckSplit = false; |
5019 | 0 | } |
5020 | 0 | } |
5021 | 0 | } |
5022 | 0 | else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH) |
5023 | 0 | { |
5024 | 0 | if (bCheckSplit && m_maxTUDepth >= 0) |
5025 | 0 | { |
5026 | 0 | uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth; |
5027 | 0 | bCheckSplit = log2TrSize > log2MaxTrSize; |
5028 | 0 | } |
5029 | 0 | } |
5030 | |
|
5031 | 0 | bool bSplitPresentFlag = bCheckSplit && bCheckFull; |
5032 | |
|
5033 | 0 | if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit) |
5034 | 0 | bCheckFull = false; |
5035 | |
|
5036 | 0 | X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n"); |
5037 | |
|
5038 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
5039 | 0 | uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0; |
5040 | 0 | uint32_t tuDepthC = tuDepth; |
5041 | 0 | if (log2TrSizeC < 2) |
5042 | 0 | { |
5043 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
5044 | 0 | log2TrSizeC = 2; |
5045 | 0 | tuDepthC--; |
5046 | 0 | codeChroma &= !(absPartIdx & 3); |
5047 | 0 | } |
5048 | | |
5049 | | // code full block |
5050 | 0 | Cost fullCost; |
5051 | 0 | fullCost.rdcost = MAX_INT64; |
5052 | |
|
5053 | 0 | uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; |
5054 | 0 | uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; |
5055 | 0 | uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
5056 | 0 | sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
5057 | 0 | uint32_t singleEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
5058 | 0 | uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; |
5059 | 0 | uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} }; |
5060 | |
|
5061 | 0 | m_entropyCoder.store(m_rqt[depth].rqtRoot); |
5062 | |
|
5063 | 0 | uint32_t trSize = 1 << log2TrSize; |
5064 | 0 | const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); |
5065 | 0 | uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2; |
5066 | 0 | const Yuv* fencYuv = mode.fencYuv; |
5067 | | |
5068 | | // code full block |
5069 | 0 | if (bCheckFull) |
5070 | 0 | { |
5071 | 0 | uint32_t trSizeC = 1 << log2TrSizeC; |
5072 | 0 | int partSize = partitionFromLog2Size(log2TrSize); |
5073 | 0 | int partSizeC = partitionFromLog2Size(log2TrSizeC); |
5074 | 0 | const uint32_t qtLayer = log2TrSize - 2; |
5075 | 0 | uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); |
5076 | 0 | coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
5077 | |
|
5078 | 0 | bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0]; |
5079 | 0 | bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE; |
5080 | 0 | bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE; |
5081 | |
|
5082 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); |
5083 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); |
5084 | |
|
5085 | 0 | if (bEnableRDOQ) |
5086 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
5087 | |
|
5088 | 0 | const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); |
5089 | 0 | int16_t* resi = resiYuv.getLumaAddr(absPartIdx); |
5090 | 0 | numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); |
5091 | 0 | cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0]; |
5092 | |
|
5093 | 0 | m_entropyCoder.resetBits(); |
5094 | |
|
5095 | 0 | if (bSplitPresentFlag && log2TrSize > depthRange[0]) |
5096 | 0 | m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); |
5097 | |
|
5098 | 0 | if (cbfFlag[TEXT_LUMA][0]) |
5099 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); |
5100 | 0 | singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits(); |
5101 | |
|
5102 | 0 | X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n"); |
5103 | | |
5104 | | //Assuming zero residual |
5105 | 0 | sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size); |
5106 | 0 | uint32_t zeroEnergyY = 0; |
5107 | 0 | if (m_rdCost.m_psyRd) |
5108 | 0 | zeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size); |
5109 | 0 | else if(m_rdCost.m_ssimRd) |
5110 | 0 | zeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size, log2TrSize, TEXT_LUMA, absPartIdx); |
5111 | |
|
5112 | 0 | int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); |
5113 | 0 | uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size; |
5114 | |
|
5115 | 0 | if (cbfFlag[TEXT_LUMA][0]) |
5116 | 0 | { |
5117 | 0 | m_quant.invtransformNxN(cu, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only |
5118 | | |
5119 | | // non-zero cost calculation for luma - This is an approximation |
5120 | | // finally we have to encode correct cbf after comparing with null cost |
5121 | 0 | pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx); |
5122 | 0 | bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0; |
5123 | 0 | uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size; |
5124 | 0 | bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
5125 | 0 | bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0; |
5126 | 0 | bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0); |
5127 | 0 | primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY); |
5128 | |
|
5129 | 0 | const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY); |
5130 | 0 | uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); |
5131 | 0 | uint32_t nonZeroEnergyY = 0; uint64_t singleCostY = 0; |
5132 | 0 | if (m_rdCost.m_psyRd) |
5133 | 0 | { |
5134 | 0 | nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY); |
5135 | 0 | singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY); |
5136 | 0 | } |
5137 | 0 | else if(m_rdCost.m_ssimRd) |
5138 | 0 | { |
5139 | 0 | nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, curReconY, strideReconY, log2TrSize, TEXT_LUMA, absPartIdx); |
5140 | 0 | singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroEnergyY); |
5141 | 0 | } |
5142 | 0 | else |
5143 | 0 | singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]); |
5144 | |
|
5145 | 0 | if (cu.m_tqBypass[0]) |
5146 | 0 | { |
5147 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
5148 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
5149 | 0 | } |
5150 | 0 | else |
5151 | 0 | { |
5152 | | // zero-cost calculation for luma. This is an approximation |
5153 | | // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf. |
5154 | | // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma. |
5155 | 0 | uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA); |
5156 | |
|
5157 | 0 | if (nullCostY < singleCostY) |
5158 | 0 | { |
5159 | 0 | cbfFlag[TEXT_LUMA][0] = 0; |
5160 | 0 | singleBits[TEXT_LUMA][0] = 0; |
5161 | 0 | primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0); |
5162 | | #if CHECKED_BUILD || _DEBUG |
5163 | | uint32_t numCoeffY = 1 << (log2TrSize << 1); |
5164 | | memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY); |
5165 | | #endif |
5166 | 0 | if (checkTransformSkipY) |
5167 | 0 | minCost[TEXT_LUMA][0] = nullCostY; |
5168 | 0 | singleDist[TEXT_LUMA][0] = zeroDistY; |
5169 | 0 | singleEnergy[TEXT_LUMA][0] = zeroEnergyY; |
5170 | 0 | } |
5171 | 0 | else |
5172 | 0 | { |
5173 | 0 | if (checkTransformSkipY) |
5174 | 0 | minCost[TEXT_LUMA][0] = singleCostY; |
5175 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
5176 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
5177 | 0 | } |
5178 | 0 | } |
5179 | 0 | } |
5180 | 0 | else |
5181 | 0 | { |
5182 | 0 | if (checkTransformSkipY) |
5183 | 0 | minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA); |
5184 | 0 | primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0); |
5185 | 0 | singleDist[TEXT_LUMA][0] = zeroDistY; |
5186 | 0 | singleBits[TEXT_LUMA][0] = 0; |
5187 | 0 | singleEnergy[TEXT_LUMA][0] = zeroEnergyY; |
5188 | 0 | } |
5189 | |
|
5190 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
5191 | |
|
5192 | 0 | if (codeChroma) |
5193 | 0 | { |
5194 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
5195 | 0 | uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; |
5196 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
5197 | 0 | { |
5198 | 0 | sse_t zeroDistC = 0; |
5199 | 0 | uint32_t zeroEnergyC = 0; |
5200 | 0 | coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
5201 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
5202 | |
|
5203 | 0 | do |
5204 | 0 | { |
5205 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
5206 | 0 | uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); |
5207 | |
|
5208 | 0 | cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5209 | |
|
5210 | 0 | if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) |
5211 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); |
5212 | |
|
5213 | 0 | fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); |
5214 | 0 | resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
5215 | 0 | numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); |
5216 | 0 | cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; |
5217 | |
|
5218 | 0 | uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits(); |
5219 | 0 | if (cbfFlag[chromaId][tuIterator.section]) |
5220 | 0 | m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); |
5221 | |
|
5222 | 0 | singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount; |
5223 | |
|
5224 | 0 | int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); |
5225 | 0 | zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize)); |
5226 | | |
5227 | | // Assuming zero residual |
5228 | 0 | if (m_rdCost.m_psyRd) |
5229 | 0 | zeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize); |
5230 | 0 | else if(m_rdCost.m_ssimRd) |
5231 | 0 | zeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
5232 | |
|
5233 | 0 | if (cbfFlag[chromaId][tuIterator.section]) |
5234 | 0 | { |
5235 | 0 | m_quant.invtransformNxN(cu, curResiC, strideResiC, coeffCurC + subTUOffset, |
5236 | 0 | log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); |
5237 | | |
5238 | | // non-zero cost calculation for chroma, same as luma - This is an approximation |
5239 | | // finally we have to encode correct cbf after comparing with null cost |
5240 | 0 | pixel* curReconC = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); |
5241 | 0 | uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize; |
5242 | 0 | bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
5243 | 0 | bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
5244 | 0 | bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
5245 | 0 | bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0); |
5246 | 0 | primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC); |
5247 | 0 | sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC)); |
5248 | 0 | uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); |
5249 | 0 | uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0; |
5250 | 0 | if (m_rdCost.m_psyRd) |
5251 | 0 | { |
5252 | 0 | nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC); |
5253 | 0 | singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
5254 | 0 | } |
5255 | 0 | else if(m_rdCost.m_ssimRd) |
5256 | 0 | { |
5257 | 0 | nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, fencYuv->m_csize, curReconC, strideReconC, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
5258 | 0 | singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
5259 | 0 | } |
5260 | 0 | else |
5261 | 0 | singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]); |
5262 | |
|
5263 | 0 | if (cu.m_tqBypass[0]) |
5264 | 0 | { |
5265 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
5266 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
5267 | 0 | } |
5268 | 0 | else |
5269 | 0 | { |
5270 | | //zero-cost calculation for chroma. This is an approximation |
5271 | 0 | uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepth, (TextType)chromaId); |
5272 | |
|
5273 | 0 | if (nullCostC < singleCostC) |
5274 | 0 | { |
5275 | 0 | cbfFlag[chromaId][tuIterator.section] = 0; |
5276 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
5277 | 0 | primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0); |
5278 | | #if CHECKED_BUILD || _DEBUG |
5279 | | uint32_t numCoeffC = 1 << (log2TrSizeC << 1); |
5280 | | memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC); |
5281 | | #endif |
5282 | 0 | if (checkTransformSkipC) |
5283 | 0 | minCost[chromaId][tuIterator.section] = nullCostC; |
5284 | 0 | singleDist[chromaId][tuIterator.section] = zeroDistC; |
5285 | 0 | singleEnergy[chromaId][tuIterator.section] = zeroEnergyC; |
5286 | 0 | } |
5287 | 0 | else |
5288 | 0 | { |
5289 | 0 | if (checkTransformSkipC) |
5290 | 0 | minCost[chromaId][tuIterator.section] = singleCostC; |
5291 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
5292 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
5293 | 0 | } |
5294 | 0 | } |
5295 | 0 | } |
5296 | 0 | else |
5297 | 0 | { |
5298 | 0 | if (checkTransformSkipC) |
5299 | 0 | minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId); |
5300 | 0 | primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0); |
5301 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
5302 | 0 | singleDist[chromaId][tuIterator.section] = zeroDistC; |
5303 | 0 | singleEnergy[chromaId][tuIterator.section] = zeroEnergyC; |
5304 | 0 | } |
5305 | |
|
5306 | 0 | cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5307 | 0 | } |
5308 | 0 | while (tuIterator.isNextSection()); |
5309 | 0 | } |
5310 | 0 | } |
5311 | |
|
5312 | 0 | if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400) |
5313 | 0 | { |
5314 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
5315 | 0 | { |
5316 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
5317 | 0 | do |
5318 | 0 | { |
5319 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
5320 | 0 | cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5321 | 0 | } |
5322 | 0 | while(tuIterator.isNextSection()); |
5323 | 0 | } |
5324 | 0 | } |
5325 | 0 | if (checkTransformSkipY) |
5326 | 0 | { |
5327 | 0 | sse_t nonZeroDistY = 0; |
5328 | 0 | uint32_t nonZeroEnergyY = 0; |
5329 | 0 | uint64_t singleCostY = MAX_INT64; |
5330 | |
|
5331 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
5332 | |
|
5333 | 0 | cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth); |
5334 | |
|
5335 | 0 | if (bEnableRDOQ) |
5336 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); |
5337 | |
|
5338 | 0 | fenc = fencYuv->getLumaAddr(absPartIdx); |
5339 | 0 | resi = resiYuv.getLumaAddr(absPartIdx); |
5340 | 0 | uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true); |
5341 | |
|
5342 | 0 | if (numSigTSkipY) |
5343 | 0 | { |
5344 | 0 | m_entropyCoder.resetBits(); |
5345 | 0 | m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth); |
5346 | 0 | m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA); |
5347 | 0 | const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); |
5348 | |
|
5349 | 0 | m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY); |
5350 | 0 | bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0; |
5351 | |
|
5352 | 0 | bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0); |
5353 | 0 | primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize); |
5354 | 0 | nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize); |
5355 | |
|
5356 | 0 | if (m_rdCost.m_psyRd) |
5357 | 0 | { |
5358 | 0 | nonZeroEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize); |
5359 | 0 | singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY); |
5360 | 0 | } |
5361 | 0 | else if(m_rdCost.m_ssimRd) |
5362 | 0 | { |
5363 | 0 | nonZeroEnergyY = m_quant.ssimDistortion(cu, fenc, fencYuv->m_size, m_tsRecon, trSize, log2TrSize, TEXT_LUMA, absPartIdx); |
5364 | 0 | singleCostY = m_rdCost.calcSsimRdCost(nonZeroDistY, skipSingleBitsY, nonZeroEnergyY); |
5365 | 0 | } |
5366 | 0 | else |
5367 | 0 | singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY); |
5368 | 0 | } |
5369 | |
|
5370 | 0 | if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY) |
5371 | 0 | cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); |
5372 | 0 | else |
5373 | 0 | { |
5374 | 0 | singleDist[TEXT_LUMA][0] = nonZeroDistY; |
5375 | 0 | singleEnergy[TEXT_LUMA][0] = nonZeroEnergyY; |
5376 | 0 | cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; |
5377 | 0 | bestTransformMode[TEXT_LUMA][0] = 1; |
5378 | 0 | if (m_param->limitTU) |
5379 | 0 | numSig[TEXT_LUMA][0] = numSigTSkipY; |
5380 | 0 | uint32_t numCoeffY = 1 << (log2TrSize << 1); |
5381 | 0 | memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY); |
5382 | 0 | primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize); |
5383 | 0 | } |
5384 | |
|
5385 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
5386 | 0 | } |
5387 | |
|
5388 | 0 | if (codeChroma && checkTransformSkipC) |
5389 | 0 | { |
5390 | 0 | sse_t nonZeroDistC = 0; |
5391 | 0 | uint32_t nonZeroEnergyC = 0; |
5392 | 0 | uint64_t singleCostC = MAX_INT64; |
5393 | 0 | uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; |
5394 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
5395 | |
|
5396 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
5397 | |
|
5398 | 0 | for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) |
5399 | 0 | { |
5400 | 0 | coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; |
5401 | 0 | TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); |
5402 | |
|
5403 | 0 | do |
5404 | 0 | { |
5405 | 0 | uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; |
5406 | 0 | uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); |
5407 | |
|
5408 | 0 | int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); |
5409 | |
|
5410 | 0 | cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5411 | |
|
5412 | 0 | if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) |
5413 | 0 | m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); |
5414 | |
|
5415 | 0 | fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); |
5416 | 0 | resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); |
5417 | 0 | uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); |
5418 | |
|
5419 | 0 | m_entropyCoder.resetBits(); |
5420 | 0 | singleBits[chromaId][tuIterator.section] = 0; |
5421 | |
|
5422 | 0 | if (numSigTSkipC) |
5423 | 0 | { |
5424 | 0 | m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth); |
5425 | 0 | m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId); |
5426 | 0 | singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); |
5427 | |
|
5428 | 0 | m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff, |
5429 | 0 | log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); |
5430 | 0 | bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0; |
5431 | 0 | bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0); |
5432 | 0 | primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC); |
5433 | 0 | nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC)); |
5434 | 0 | if (m_rdCost.m_psyRd) |
5435 | 0 | { |
5436 | 0 | nonZeroEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC); |
5437 | 0 | singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
5438 | 0 | } |
5439 | 0 | else if(m_rdCost.m_ssimRd) |
5440 | 0 | { |
5441 | 0 | nonZeroEnergyC = m_quant.ssimDistortion(cu, fenc, mode.fencYuv->m_csize, m_tsRecon, trSizeC, log2TrSizeC, (TextType)chromaId, absPartIdxC); |
5442 | 0 | singleCostC = m_rdCost.calcSsimRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroEnergyC); |
5443 | 0 | } |
5444 | 0 | else |
5445 | 0 | singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]); |
5446 | 0 | } |
5447 | |
|
5448 | 0 | if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC) |
5449 | 0 | cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5450 | 0 | else |
5451 | 0 | { |
5452 | 0 | singleDist[chromaId][tuIterator.section] = nonZeroDistC; |
5453 | 0 | singleEnergy[chromaId][tuIterator.section] = nonZeroEnergyC; |
5454 | 0 | cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; |
5455 | 0 | bestTransformMode[chromaId][tuIterator.section] = 1; |
5456 | 0 | uint32_t numCoeffC = 1 << (log2TrSizeC << 1); |
5457 | 0 | memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC); |
5458 | 0 | primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC); |
5459 | 0 | } |
5460 | |
|
5461 | 0 | cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); |
5462 | 0 | } |
5463 | 0 | while (tuIterator.isNextSection()); |
5464 | 0 | } |
5465 | 0 | } |
5466 | | |
5467 | | // Previously, cbfs and coefficients were both encoded here, after calculating distortion above. |
5468 | | // Now only the cbfs are encoded here, since the coefficients have already been encoded above; the bits |
5469 | | // required for the coefficients were collected there and are added to the number of cbf bits. In testing, the |
5470 | | // order made no difference. It is still unclear whether the original context should be loaded as done below. |
5471 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
5472 | 0 | m_entropyCoder.resetBits(); |
5473 | | |
5474 | | //Encode cbf flags |
5475 | 0 | if (codeChroma) |
5476 | 0 | { |
5477 | 0 | if (!splitIntoSubTUs) |
5478 | 0 | { |
5479 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth); |
5480 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth); |
5481 | 0 | } |
5482 | 0 | else |
5483 | 0 | { |
5484 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx); |
5485 | 0 | offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx); |
5486 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][0], tuDepth); |
5487 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_U][1], tuDepth); |
5488 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][0], tuDepth); |
5489 | 0 | m_entropyCoder.codeQtCbfChroma(cbfFlag[TEXT_CHROMA_V][1], tuDepth); |
5490 | 0 | } |
5491 | 0 | } |
5492 | |
|
5493 | 0 | m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); |
5494 | |
|
5495 | 0 | uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits(); |
5496 | |
|
5497 | 0 | uint32_t coeffBits = 0; |
5498 | 0 | coeffBits = singleBits[TEXT_LUMA][0]; |
5499 | 0 | for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) |
5500 | 0 | { |
5501 | 0 | coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex]; |
5502 | 0 | coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex]; |
5503 | 0 | } |
5504 | | |
5505 | | // In split mode, we need only coeffBits, because encoding chroma cbfs is different from encoding luma cbfs. |
5506 | | // For chroma, if the cbf of any one of the split blocks is 1, then we need to encode a cbf of 1, followed by |
5507 | | // the individual cbf value of each of the four split blocks. This is not known before the four split blocks are analyzed. |
5508 | | // For that reason, only the individual coefficient bits are collected here. |
5509 | 0 | fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits; |
5510 | |
|
5511 | 0 | fullCost.distortion += singleDist[TEXT_LUMA][0]; |
5512 | 0 | fullCost.energy += singleEnergy[TEXT_LUMA][0];// need to check we need to add chroma also |
5513 | 0 | for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) |
5514 | 0 | { |
5515 | 0 | fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex]; |
5516 | 0 | fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex]; |
5517 | 0 | } |
5518 | |
|
5519 | 0 | if (m_rdCost.m_psyRd) |
5520 | 0 | fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
5521 | 0 | else if(m_rdCost.m_ssimRd) |
5522 | 0 | fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); |
5523 | 0 | else |
5524 | 0 | fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits); |
5525 | |
|
5526 | 0 | if (m_param->limitTU && bCheckSplit) |
5527 | 0 | { |
5528 | | // Stop recursion if the TU's energy level is minimal |
5529 | 0 | uint32_t numCoeff = trSize * trSize; |
5530 | 0 | if (cbfFlag[TEXT_LUMA][0] == 0) |
5531 | 0 | bCheckSplit = false; |
5532 | 0 | else if (numSig[TEXT_LUMA][0] < (numCoeff / 64)) |
5533 | 0 | { |
5534 | 0 | uint32_t energy = 0; |
5535 | 0 | for (uint32_t i = 0; i < numCoeff; i++) |
5536 | 0 | energy += abs(coeffCurY[i]); |
5537 | 0 | if (energy == numSig[TEXT_LUMA][0]) |
5538 | 0 | bCheckSplit = false; |
5539 | 0 | } |
5540 | 0 | } |
5541 | |
|
5542 | 0 | if (bSaveTUData) |
5543 | 0 | { |
5544 | 0 | for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++) |
5545 | 0 | { |
5546 | 0 | for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++) |
5547 | 0 | { |
5548 | 0 | m_cacheTU.bestTransformMode[idx][plane][part] = bestTransformMode[plane][part]; |
5549 | 0 | m_cacheTU.cbfFlag[idx][plane][part] = cbfFlag[plane][part]; |
5550 | 0 | } |
5551 | 0 | } |
5552 | 0 | m_cacheTU.cost[idx] = fullCost; |
5553 | 0 | m_entropyCoder.store(m_cacheTU.rqtStore[idx]); |
5554 | 0 | } |
5555 | 0 | } |
5556 | 0 | if (bLoadTUData) |
5557 | 0 | { |
5558 | 0 | for (int plane = 0; plane < MAX_NUM_COMPONENT; plane++) |
5559 | 0 | { |
5560 | 0 | for(int part = 0; part < (m_csp == X265_CSP_I422) + 1; part++) |
5561 | 0 | { |
5562 | 0 | bestTransformMode[plane][part] = m_cacheTU.bestTransformMode[idx][plane][part]; |
5563 | 0 | cbfFlag[plane][part] = m_cacheTU.cbfFlag[idx][plane][part]; |
5564 | 0 | } |
5565 | 0 | } |
5566 | 0 | fullCost = m_cacheTU.cost[idx]; |
5567 | 0 | m_entropyCoder.load(m_cacheTU.rqtStore[idx]); |
5568 | 0 | bCheckFull = true; |
5569 | 0 | } |
5570 | | |
5571 | | // code sub-blocks |
5572 | 0 | if (bCheckSplit) |
5573 | 0 | { |
5574 | 0 | if (bCheckFull) |
5575 | 0 | { |
5576 | 0 | m_entropyCoder.store(m_rqt[depth].rqtTest); |
5577 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
5578 | 0 | } |
5579 | |
|
5580 | 0 | Cost splitCost; |
5581 | 0 | if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) |
5582 | 0 | { |
5583 | | // Subdiv flag can be encoded at the start of analysis of split blocks. |
5584 | 0 | m_entropyCoder.resetBits(); |
5585 | 0 | m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); |
5586 | 0 | splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); |
5587 | 0 | } |
5588 | |
|
5589 | 0 | bool yCbCrCbf = splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 0); |
5590 | 0 | if (yCbCrCbf || !bCheckFull) |
5591 | 0 | { |
5592 | 0 | if (splitCost.rdcost < fullCost.rdcost) |
5593 | 0 | { |
5594 | 0 | if (m_limitTU & X265_TU_LIMIT_BFS) |
5595 | 0 | { |
5596 | 0 | uint32_t nextlog2TrSize = cuGeom.log2CUSize - (tuDepth + 1); |
5597 | 0 | bool nextSplit = nextlog2TrSize > depthRange[0]; |
5598 | 0 | if (nextSplit) |
5599 | 0 | { |
5600 | 0 | m_entropyCoder.load(m_rqt[depth].rqtRoot); |
5601 | 0 | splitCost.bits = splitCost.distortion = splitCost.rdcost = splitCost.energy = 0; |
5602 | 0 | if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) |
5603 | 0 | { |
5604 | | // Subdiv flag can be encoded at the start of analysis of split blocks. |
5605 | 0 | m_entropyCoder.resetBits(); |
5606 | 0 | m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); |
5607 | 0 | splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); |
5608 | 0 | } |
5609 | 0 | splitTU(mode, cuGeom, absPartIdx, tuDepth, resiYuv, splitCost, depthRange, 1); |
5610 | 0 | } |
5611 | 0 | } |
5612 | 0 | outCosts.distortion += splitCost.distortion; |
5613 | 0 | outCosts.rdcost += splitCost.rdcost; |
5614 | 0 | outCosts.bits += splitCost.bits; |
5615 | 0 | outCosts.energy += splitCost.energy; |
5616 | 0 | return; |
5617 | 0 | } |
5618 | 0 | else |
5619 | 0 | outCosts.energy += splitCost.energy; |
5620 | 0 | } |
5621 | | |
5622 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth); |
5623 | 0 | if (codeChroma) |
5624 | 0 | { |
5625 | 0 | if (!splitIntoSubTUs) |
5626 | 0 | { |
5627 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth); |
5628 | 0 | cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth); |
5629 | 0 | } |
5630 | 0 | else |
5631 | 0 | { |
5632 | 0 | uint32_t tuNumParts = absPartIdxStep >> 1; |
5633 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts); |
5634 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); |
5635 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts); |
5636 | 0 | cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); |
5637 | 0 | } |
5638 | 0 | } |
5639 | 0 | X265_CHECK(bCheckFull, "check-full must be set\n"); |
5640 | 0 | m_entropyCoder.load(m_rqt[depth].rqtTest); |
5641 | 0 | } |
5642 | | |
5643 | 0 | cu.setTUDepthSubParts(tuDepth, absPartIdx, depth); |
5644 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); |
5645 | |
|
5646 | 0 | if (codeChroma) |
5647 | 0 | { |
5648 | 0 | if (!splitIntoSubTUs) |
5649 | 0 | { |
5650 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth); |
5651 | 0 | cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth); |
5652 | 0 | } |
5653 | 0 | else |
5654 | 0 | { |
5655 | 0 | uint32_t tuNumParts = absPartIdxStep >> 1; |
5656 | |
|
5657 | 0 | offsetCBFs(cbfFlag[TEXT_CHROMA_U]); |
5658 | 0 | offsetCBFs(cbfFlag[TEXT_CHROMA_V]); |
5659 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts); |
5660 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); |
5661 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts); |
5662 | 0 | cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); |
5663 | 0 | } |
5664 | 0 | } |
5665 | |
|
5666 | 0 | outCosts.distortion += fullCost.distortion; |
5667 | 0 | outCosts.rdcost += fullCost.rdcost; |
5668 | 0 | outCosts.bits += fullCost.bits; |
5669 | 0 | outCosts.energy += fullCost.energy; |
5670 | 0 | } |
5671 | | |
5672 | | void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]) |
5673 | 0 | { |
5674 | 0 | X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n"); |
5675 | |
|
5676 | 0 | const bool bSubdiv = tuDepth < cu.m_tuDepth[absPartIdx]; |
5677 | 0 | uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
5678 | 0 | if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) |
5679 | 0 | { |
5680 | 0 | if (!(log2TrSize - m_hChromaShift < 2)) |
5681 | 0 | { |
5682 | 0 | uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2); |
5683 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1)) |
5684 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv); |
5685 | 0 | if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1)) |
5686 | 0 | m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv); |
5687 | 0 | } |
5688 | 0 | } |
5689 | |
|
5690 | 0 | if (!bSubdiv) |
5691 | 0 | { |
5692 | 0 | m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); |
5693 | 0 | } |
5694 | 0 | else |
5695 | 0 | { |
5696 | 0 | uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2; |
5697 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
5698 | 0 | codeInterSubdivCbfQT(cu, absPartIdx, tuDepth + 1, depthRange); |
5699 | 0 | } |
5700 | 0 | } |
5701 | | |
5702 | | void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth) |
5703 | 0 | { |
5704 | 0 | const uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth; |
5705 | |
|
5706 | 0 | if (tuDepth < cu.m_tuDepth[absPartIdx]) |
5707 | 0 | { |
5708 | 0 | uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; |
5709 | 0 | for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) |
5710 | 0 | saveResidualQTData(cu, resiYuv, absPartIdx, tuDepth + 1); |
5711 | 0 | return; |
5712 | 0 | } |
5713 | | |
5714 | 0 | const uint32_t qtLayer = log2TrSize - 2; |
5715 | |
|
5716 | 0 | uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; |
5717 | 0 | uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0; |
5718 | 0 | if (log2TrSizeC < 2) |
5719 | 0 | { |
5720 | 0 | X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); |
5721 | 0 | log2TrSizeC = 2; |
5722 | 0 | codeChroma &= !(absPartIdx & 3); |
5723 | 0 | } |
5724 | |
|
5725 | 0 | m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize); |
5726 | |
|
5727 | 0 | uint32_t numCoeffY = 1 << (log2TrSize * 2); |
5728 | 0 | uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2; |
5729 | 0 | coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; |
5730 | 0 | coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY; |
5731 | 0 | memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY); |
5732 | |
|
5733 | 0 | if (codeChroma) |
5734 | 0 | { |
5735 | 0 | m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift); |
5736 | |
|
5737 | 0 | uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); |
5738 | 0 | uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); |
5739 | |
|
5740 | 0 | coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; |
5741 | 0 | coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; |
5742 | 0 | coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; |
5743 | 0 | coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC; |
5744 | 0 | memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC); |
5745 | 0 | memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC); |
5746 | 0 | } |
5747 | 0 | } |
5748 | | |
5749 | | /* returns the number of bits required to signal a non-most-probable mode. |
5750 | | * on return mpms contains bitmap of most probable modes */ |
5751 | | uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const |
5752 | 0 | { |
5753 | 0 | cu.getIntraDirLumaPredictor(absPartIdx, mpmModes); |
5754 | |
|
5755 | 0 | mpms = 0; |
5756 | 0 | for (int i = 0; i < 3; ++i) |
5757 | 0 | mpms |= ((uint64_t)1 << mpmModes[i]); |
5758 | |
|
5759 | 0 | return m_entropyCoder.bitsIntraModeNonMPM(); |
5760 | 0 | } |
5761 | | |
5762 | | /* swap the current mode/cost with the mode with the highest cost in the |
5763 | | * current candidate list, if its cost is better (maintain a top N list) */ |
5764 | | void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList) |
5765 | 0 | { |
5766 | 0 | uint32_t maxIndex = 0; |
5767 | 0 | uint64_t maxValue = 0; |
5768 | |
|
5769 | 0 | for (int i = 0; i < maxCandCount; i++) |
5770 | 0 | { |
5771 | 0 | if (maxValue < candCostList[i]) |
5772 | 0 | { |
5773 | 0 | maxValue = candCostList[i]; |
5774 | 0 | maxIndex = i; |
5775 | 0 | } |
5776 | 0 | } |
5777 | |
|
5778 | 0 | if (cost < maxValue) |
5779 | 0 | { |
5780 | 0 | candCostList[maxIndex] = cost; |
5781 | 0 | candModeList[maxIndex] = mode; |
5782 | 0 | } |
5783 | 0 | } |
5784 | | |
5785 | | void Search::checkDQP(Mode& mode, const CUGeom& cuGeom) |
5786 | 0 | { |
5787 | 0 | CUData& cu = mode.cu; |
5788 | 0 | if (cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth) |
5789 | 0 | { |
5790 | 0 | if (cu.getQtRootCbf(0)) |
5791 | 0 | { |
5792 | 0 | if (m_param->rdLevel >= 3) |
5793 | 0 | { |
5794 | 0 | mode.contexts.resetBits(); |
5795 | 0 | mode.contexts.codeDeltaQP(cu, 0); |
5796 | 0 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); |
5797 | 0 | mode.totalBits += bits; |
5798 | 0 | updateModeCost(mode); |
5799 | 0 | } |
5800 | 0 | else if (m_param->rdLevel <= 1) |
5801 | 0 | { |
5802 | 0 | mode.sa8dBits++; |
5803 | 0 | mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits); |
5804 | 0 | } |
5805 | 0 | else |
5806 | 0 | { |
5807 | 0 | mode.totalBits++; |
5808 | 0 | updateModeCost(mode); |
5809 | 0 | } |
5810 | 0 | } |
5811 | 0 | else |
5812 | 0 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); |
5813 | 0 | } |
5814 | 0 | } |
5815 | | |
5816 | | void Search::checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom) |
5817 | 0 | { |
5818 | 0 | CUData& cu = mode.cu; |
5819 | |
|
5820 | 0 | if ((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP) |
5821 | 0 | { |
5822 | 0 | bool hasResidual = false; |
5823 | | |
5824 | | /* Check if any sub-CU has a non-zero QP */ |
5825 | 0 | for (uint32_t blkIdx = 0; blkIdx < cuGeom.numPartitions; blkIdx++) |
5826 | 0 | { |
5827 | 0 | if (cu.getQtRootCbf(blkIdx)) |
5828 | 0 | { |
5829 | 0 | hasResidual = true; |
5830 | 0 | break; |
5831 | 0 | } |
5832 | 0 | } |
5833 | 0 | if (hasResidual) |
5834 | 0 | { |
5835 | 0 | if (m_param->rdLevel >= 3) |
5836 | 0 | { |
5837 | 0 | mode.contexts.resetBits(); |
5838 | 0 | mode.contexts.codeDeltaQP(cu, 0); |
5839 | 0 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); |
5840 | 0 | mode.totalBits += bits; |
5841 | 0 | updateModeCost(mode); |
5842 | 0 | } |
5843 | 0 | else if (m_param->rdLevel <= 1) |
5844 | 0 | { |
5845 | 0 | mode.sa8dBits++; |
5846 | 0 | mode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)mode.distortion, mode.sa8dBits); |
5847 | 0 | } |
5848 | 0 | else |
5849 | 0 | { |
5850 | 0 | mode.totalBits++; |
5851 | 0 | updateModeCost(mode); |
5852 | 0 | } |
5853 | | /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled). |
5854 | | When the non-zero CBF sub-CU is found, stop */ |
5855 | 0 | cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); |
5856 | 0 | } |
5857 | 0 | else |
5858 | | /* No residual within this CU or subCU, so reset QP to RefQP */ |
5859 | 0 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); |
5860 | 0 | } |
5861 | 0 | } |