/work/vvenc/source/Lib/EncoderLib/EncSlice.cpp
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | |
44 | | /** \file EncSlice.cpp |
45 | | \brief slice encoder class |
46 | | */ |
47 | | |
48 | | #include "EncSlice.h" |
49 | | #include "EncStage.h" |
50 | | #include "EncLib.h" |
51 | | #include "EncPicture.h" |
52 | | #include "BitAllocation.h" |
53 | | #include "CommonLib/UnitTools.h" |
54 | | #include "CommonLib/Picture.h" |
55 | | #include "CommonLib/TimeProfiler.h" |
56 | | #include "CommonLib/dtrace_codingstruct.h" |
57 | | #include "Utilities/NoMallocThreadPool.h" |
58 | | |
59 | | #include <math.h> |
60 | | #include "vvenc/vvencCfg.h" |
61 | | |
62 | | //! \ingroup EncoderLib |
63 | | //! \{ |
64 | | |
65 | | namespace vvenc { |
66 | | |
#ifdef TRACE_ENABLE_ITT
// Intel ITT tracing objects, created once during static initialization and only
// compiled in when TRACE_ENABLE_ITT is defined: one domain ("Encode") and one
// string handle per label listed below.
static const __itt_domain*        itt_domain_encode      { __itt_domain_create( "Encode" ) };
static const __itt_string_handle* itt_handle_ctuEncode   { __itt_string_handle_create( "Encode_CTU" ) };
static const __itt_string_handle* itt_handle_rspLfVer    { __itt_string_handle_create( "RspLfVer_CTU" ) };
static const __itt_string_handle* itt_handle_lfHor       { __itt_string_handle_create( "LfHor_CTU" ) };
static const __itt_string_handle* itt_handle_sao         { __itt_string_handle_create( "SAO_CTU" ) };
static const __itt_string_handle* itt_handle_alf_stat    { __itt_string_handle_create( "ALF_CTU_STAT" ) };
static const __itt_string_handle* itt_handle_alf_derive  { __itt_string_handle_create( "ALF_DERIVE" ) };
static const __itt_string_handle* itt_handle_alf_recon   { __itt_string_handle_create( "ALF_RECONSTRUCT" ) };
static const __itt_string_handle* itt_handle_ccalf_stat  { __itt_string_handle_create( "CCALF_CTU_STAT" ) };
static const __itt_string_handle* itt_handle_ccalf_derive{ __itt_string_handle_create( "CCALF_DERIVE" ) };
static const __itt_string_handle* itt_handle_ccalf_recon { __itt_string_handle_create( "CCALF_RECONSTRUCT" ) };
#endif
80 | | |
81 | | void setArbitraryWppPattern( const PreCalcValues& pcv, std::vector<int>& ctuAddrMap, int stepX = 1 ) |
82 | 5.19k | { |
83 | 5.19k | ctuAddrMap.resize( pcv.sizeInCtus, 0 ); |
84 | 5.19k | std::vector<int> x_in_line( pcv.heightInCtus, 0 ); |
85 | 5.19k | int x = 0, y = 0, addr = 0; |
86 | 5.19k | int y_top = 0; |
87 | 5.19k | const int step = stepX; // number of CTUs in x-direction to scan |
88 | 5.19k | ctuAddrMap[addr++] = x++; // first entry (can be omitted) |
89 | 12.1k | while( addr < pcv.sizeInCtus ) |
90 | 7.87k | { |
91 | | // fill entries in x-direction |
92 | 7.87k | int x1 = x; |
93 | 18.8k | while( x < std::min(x1 + step, (int)pcv.widthInCtus) ) |
94 | 11.0k | { |
95 | | // general WPP condition (top-right CTU availability) |
96 | 11.0k | if( y > 0 && !( x_in_line[y - 1] - x >= 2 ) && x != pcv.widthInCtus - 1 ) |
97 | 0 | break; |
98 | 11.0k | ctuAddrMap[addr++] = y*pcv.widthInCtus + x; |
99 | 11.0k | x++; |
100 | 11.0k | } |
101 | 7.87k | x_in_line[y] = x; |
102 | | |
103 | 7.87k | y += 1; |
104 | | |
105 | 7.87k | if( y >= pcv.heightInCtus ) |
106 | 4.00k | { |
107 | | // go up |
108 | 4.00k | if( x_in_line[y_top] >= pcv.widthInCtus ) |
109 | 4.00k | { |
110 | 4.00k | y_top++; |
111 | 4.00k | if( y_top >= pcv.heightInCtus ) |
112 | 936 | { |
113 | | // done |
114 | 936 | break; |
115 | 936 | } |
116 | 4.00k | } |
117 | 3.07k | y = y_top; |
118 | 3.07k | } |
119 | 6.94k | x = x_in_line[y]; |
120 | | |
121 | 6.94k | CHECK( y >= pcv.heightInCtus, "Height in CTUs is exceeded" ); |
122 | 6.94k | } |
123 | 5.19k | } |
124 | | |
125 | | struct TileLineEncRsrc |
126 | | { |
127 | | BitEstimator m_BitEstimator; |
128 | | CABACWriter m_CABACEstimator; |
129 | | BitEstimator m_SaoBitEstimator; |
130 | | CABACWriter m_SaoCABACEstimator; |
131 | | BitEstimator m_AlfBitEstimator; |
132 | | CABACWriter m_AlfCABACEstimator; |
133 | | ReuseUniMv m_ReuseUniMv; |
134 | | BlkUniMvInfoBuffer m_BlkUniMvInfoBuffer; |
135 | | AffineProfList m_AffineProfList; |
136 | | IbcBvCand m_CachedBvs; |
137 | | EncSampleAdaptiveOffset m_encSao; |
138 | | int m_prevQp[ MAX_NUM_CH ]; |
139 | 9.06k | TileLineEncRsrc( const VVEncCfg& encCfg ) : m_CABACEstimator( m_BitEstimator ), m_SaoCABACEstimator( m_SaoBitEstimator ), m_AlfCABACEstimator( m_AlfBitEstimator ) { m_AffineProfList.init( ! encCfg.m_picReordering ); } |
140 | | }; |
141 | | |
// Encoder objects that EncSlice::init() allocates once per entry of m_ThreadRsrc
// (threadPool->numThreads() entries, or a single one without a thread pool).
// m_alfTempCtuBuf is created in init() and destroyed explicitly in ~EncSlice().
142 | | struct PerThreadRsrc
143 | | {
144 | | CtxCache m_CtxCache;
145 | | EncCu m_encCu;
146 | | PelStorage m_alfTempCtuBuf;
147 | | };
148 | | |
149 | | struct CtuEncParam |
150 | | { |
151 | | Picture* pic; |
152 | | EncSlice* encSlice; |
153 | | int ctuRsAddr; |
154 | | int ctuPosX; |
155 | | int ctuPosY; |
156 | | UnitArea ctuArea; |
157 | | int tileLineResIdx; |
158 | | |
159 | 16.2k | CtuEncParam() : pic( nullptr ), encSlice( nullptr ), ctuRsAddr( 0 ), ctuPosX( 0 ), ctuPosY( 0 ), ctuArea(), tileLineResIdx( 0 ) {} |
160 | | CtuEncParam( Picture* _p, EncSlice* _s, const int _r, const int _x, const int _y, const int _tileLineResIdx ) |
161 | | : pic( _p ) |
162 | | , encSlice( _s ) |
163 | | , ctuRsAddr( _r ) |
164 | | , ctuPosX( _x ) |
165 | | , ctuPosY( _y ) |
166 | | , ctuArea( pic->chromaFormat, pic->slices[0]->pps->pcv->getCtuArea( _x, _y ) ) |
167 | 0 | , tileLineResIdx( _tileLineResIdx ) {} |
168 | | }; |
169 | | |
170 | | // ==================================================================================================================== |
171 | | // Constructor / destructor / create / destroy |
172 | | // ==================================================================================================================== |
173 | | |
// Default constructor: all collaborator pointers start as nullptr (they are assigned in init()),
// the CTU encode delay starts at 1, the CABAC writer is bound to the member bin encoder and
// the CABAC table index starts at VVENC_I_SLICE.
174 | | EncSlice::EncSlice()
175 | 5.19k | : m_pcEncCfg ( nullptr)
176 | 5.19k | , m_threadPool ( nullptr )
177 | 5.19k | , m_ctuTasksDoneCounter( nullptr )
178 | 5.19k | , m_ctuEncDelay ( 1 )
179 | 5.19k | , m_pLoopFilter ( nullptr )
180 | 5.19k | , m_pALF ( nullptr )
181 | 5.19k | , m_pcRateCtrl ( nullptr )
182 | 5.19k | , m_CABACWriter ( m_BinEncoder )
183 | 5.19k | , m_encCABACTableIdx ( VVENC_I_SLICE )
184 | 5.19k | {
185 | 5.19k | }
186 | | |
187 | | |
188 | | EncSlice::~EncSlice() |
189 | 5.19k | { |
190 | 5.19k | for( auto* lnRsc : m_TileLineEncRsrc ) |
191 | 9.06k | { |
192 | 9.06k | delete lnRsc; |
193 | 9.06k | } |
194 | 5.19k | m_TileLineEncRsrc.clear(); |
195 | | |
196 | 5.19k | for( auto* taskRsc: m_ThreadRsrc ) |
197 | 20.7k | { |
198 | 20.7k | taskRsc->m_alfTempCtuBuf.destroy(); |
199 | 20.7k | delete taskRsc; |
200 | 20.7k | } |
201 | 5.19k | m_ThreadRsrc.clear(); |
202 | | |
203 | 5.19k | m_saoReconParams.clear(); |
204 | | |
205 | 21.3k | for( int i = 0; i < m_saoStatData.size(); i++ ) |
206 | 16.2k | { |
207 | 64.8k | for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ ) |
208 | 48.6k | { |
209 | 48.6k | delete[] m_saoStatData[ i ][ compIdx ]; |
210 | 48.6k | } |
211 | 16.2k | delete[] m_saoStatData[ i ]; |
212 | 16.2k | } |
213 | 5.19k | m_saoStatData.clear(); |
214 | 5.19k | } |
215 | | |
216 | | void EncSlice::init( const VVEncCfg& encCfg, |
217 | | const SPS& sps, |
218 | | const PPS& pps, |
219 | | std::vector<int>* const globalCtuQpVector, |
220 | | LoopFilter& loopFilter, |
221 | | EncAdaptiveLoopFilter& alf, |
222 | | RateCtrl& rateCtrl, |
223 | | NoMallocThreadPool* threadPool, |
224 | | WaitCounter* ctuTasksDoneCounter ) |
225 | 5.19k | { |
226 | 5.19k | m_pcEncCfg = &encCfg; |
227 | 5.19k | m_pLoopFilter = &loopFilter; |
228 | 5.19k | m_pALF = &alf; |
229 | 5.19k | m_pcRateCtrl = &rateCtrl; |
230 | 5.19k | m_threadPool = threadPool; |
231 | 5.19k | m_ctuTasksDoneCounter = ctuTasksDoneCounter; |
232 | 5.19k | m_syncPicCtx.resize( encCfg.m_entropyCodingSyncEnabled ? pps.getNumTileLineIds() : 0 ); |
233 | | |
234 | | |
235 | 5.19k | const int maxCntRscr = ( encCfg.m_numThreads > 0 ) ? pps.getNumTileLineIds() : 1; |
236 | 5.19k | const int maxCtuEnc = ( encCfg.m_numThreads > 0 && threadPool ) ? threadPool->numThreads() : 1; |
237 | | |
238 | 5.19k | m_ThreadRsrc.resize( maxCtuEnc, nullptr ); |
239 | 5.19k | m_TileLineEncRsrc.resize( maxCntRscr, nullptr ); |
240 | | |
241 | 5.19k | for( PerThreadRsrc*& taskRsc : m_ThreadRsrc ) |
242 | 20.7k | { |
243 | 20.7k | taskRsc = new PerThreadRsrc(); |
244 | 20.7k | taskRsc->m_encCu.init( encCfg, |
245 | 20.7k | sps, |
246 | 20.7k | globalCtuQpVector, |
247 | 20.7k | m_syncPicCtx.data(), |
248 | 20.7k | &rateCtrl ); |
249 | 20.7k | taskRsc->m_alfTempCtuBuf.create( pps.pcv->chrFormat, Area( 0, 0, pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1), pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1) ), pps.pcv->maxCUSize, MAX_ALF_PADDING_SIZE, 0, false ); |
250 | 20.7k | } |
251 | | |
252 | 5.19k | for( TileLineEncRsrc*& lnRsc : m_TileLineEncRsrc ) |
253 | 9.06k | { |
254 | 9.06k | lnRsc = new TileLineEncRsrc( encCfg ); |
255 | 9.06k | if( sps.saoEnabled ) |
256 | 9.06k | { |
257 | 9.06k | lnRsc->m_encSao.init( encCfg ); |
258 | 9.06k | } |
259 | 9.06k | } |
260 | | |
261 | 5.19k | const int sizeInCtus = pps.pcv->sizeInCtus; |
262 | 5.19k | m_processStates = std::vector<ProcessCtuState>( sizeInCtus ); |
263 | 5.19k | m_saoReconParams.resize( sizeInCtus ); |
264 | | |
265 | 5.19k | ::memset( m_saoDisabledRate, 0, sizeof( m_saoDisabledRate ) ); |
266 | | |
267 | | // sao statistics |
268 | 5.19k | if( encCfg.m_bUseSAO ) |
269 | 5.19k | { |
270 | 5.19k | m_saoStatData.resize( sizeInCtus ); |
271 | 21.3k | for( int i = 0; i < sizeInCtus; i++ ) |
272 | 16.2k | { |
273 | 16.2k | m_saoStatData[ i ] = new SAOStatData*[ MAX_NUM_COMP ]; |
274 | 64.8k | for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ ) |
275 | 48.6k | { |
276 | 48.6k | m_saoStatData[ i ][ compIdx ] = new SAOStatData[ NUM_SAO_NEW_TYPES ]; |
277 | 48.6k | } |
278 | 16.2k | } |
279 | 5.19k | } |
280 | 5.19k | ctuEncParams.resize( sizeInCtus ); |
281 | 5.19k | setArbitraryWppPattern( *pps.pcv, m_ctuAddrMap, 3 ); |
282 | | |
283 | 5.19k | const unsigned asuHeightInCtus = m_pALF->getAsuHeightInCtus(); |
284 | 5.19k | const unsigned numDeriveLines = encCfg.m_ifpLines ? |
285 | 5.19k | std::min( ((encCfg.m_ifpLines & (~(asuHeightInCtus - 1))) + asuHeightInCtus), pps.pcv->heightInCtus ) : pps.pcv->heightInCtus; |
286 | 5.19k | m_alfDeriveCtu = numDeriveLines * pps.pcv->widthInCtus - 1; |
287 | 5.19k | m_ccalfDeriveCtu = encCfg.m_ifpLines ? pps.pcv->widthInCtus * std::min((unsigned)encCfg.m_ifpLines + 1, pps.pcv->heightInCtus) - 1: pps.pcv->sizeInCtus - 1; |
288 | 5.19k | } |
289 | | |
290 | | |
291 | | void EncSlice::initPic( Picture* pic ) |
292 | 1.29k | { |
293 | 1.29k | Slice* slice = pic->cs->slice; |
294 | | |
295 | 1.29k | if( slice->pps->numTileCols * slice->pps->numTileRows > 1 ) |
296 | 0 | { |
297 | 0 | slice->sliceMap = slice->pps->sliceMap[0]; |
298 | 0 | } |
299 | 1.29k | else |
300 | 1.29k | { |
301 | 1.29k | slice->sliceMap.addCtusToSlice( 0, pic->cs->pcv->widthInCtus, 0, pic->cs->pcv->heightInCtus, pic->cs->pcv->widthInCtus); |
302 | 1.29k | } |
303 | | |
304 | | // this ensures that independently encoded bitstream chunks can be combined to bit-equal |
305 | 1.29k | const SliceType cabacTableIdx = ! slice->pps->cabacInitPresent || slice->pendingRasInit ? slice->sliceType : m_encCABACTableIdx; |
306 | 1.29k | slice->encCABACTableIdx = cabacTableIdx; |
307 | | |
308 | | // set QP and lambda values |
309 | 1.29k | xInitSliceLambdaQP( slice ); |
310 | | |
311 | 1.29k | for( auto* thrRsc : m_ThreadRsrc ) |
312 | 5.19k | { |
313 | 5.19k | thrRsc->m_encCu.initPic( pic ); |
314 | 5.19k | } |
315 | | |
316 | 1.29k | for( auto* lnRsc : m_TileLineEncRsrc ) |
317 | 2.26k | { |
318 | 2.26k | lnRsc->m_ReuseUniMv.resetReusedUniMvs(); |
319 | 2.26k | } |
320 | | |
321 | 1.29k | m_ctuEncDelay = 1; |
322 | 1.29k | if( pic->useIBC ) |
323 | 1.29k | { |
324 | | // IBC needs unfiltered samples up to max IBC search range |
325 | | // therefore ensure that numCtuDelayLUT CTU's have been enocded first |
326 | | // assuming IBC localSearchRangeX / Y = 128 |
327 | 1.29k | const int numCtuDelayLUT[ 3 ] = { 15, 3, 1 }; |
328 | 1.29k | CHECK( pic->cs->pcv->maxCUSizeLog2 < 5 || pic->cs->pcv->maxCUSizeLog2 > 7, "invalid max CTUSize" ); |
329 | 1.29k | m_ctuEncDelay = numCtuDelayLUT[ pic->cs->pcv->maxCUSizeLog2 - 5 ]; |
330 | 1.29k | } |
331 | 1.29k | } |
332 | | |
333 | | |
334 | | |
// Determines the slice QP, the slice lambda and the slice-level chroma QP deltas for `slice`
// and passes lambda/QP to every per-thread EncCu via setUpLambda().
// Writes slice->sliceChromaQpDelta[], slice->sliceQp and slice->chromaQpAdjEnabled; without
// perceptual QPA but with `rcp` set it also resets slice->pic->picInitialQP to -1.
// Statement order matters here: applyQPAdaptationSlice() receives iQP/dLambda and the chroma
// offset array and its result feeds everything computed afterwards.
335 | | void EncSlice::xInitSliceLambdaQP( Slice* slice )
336 | 1.29k | {
337 | | // pre-compute lambda and QP
// rcp: a rate-control target bitrate is set and the picture already carries an initial QP;
// in that case QP and lambda are taken from the picture instead of being derived here.
338 | 1.29k | const bool rcp = (m_pcEncCfg->m_RCTargetBitrate > 0 && slice->pic->picInitialQP >= 0); // 2nd pass
339 | 1.29k | int iQP = Clip3 (-slice->sps->qpBDOffset[CH_L], MAX_QP, slice->pic->picInitialQP); // RC start QP
340 | 1.29k | double dQP = (rcp ? (double) slice->pic->picInitialQP : xGetQPForPicture (slice));
// note: xCalculateLambda() overwrites iQP with the rounded, clipped dQP when it is called
341 | 1.29k | double dLambda = (rcp ? slice->pic->picInitialLambda : xCalculateLambda (slice, slice->TLayer, dQP, dQP, iQP));
// local copy of the configured offsets; a pointer to it is handed to applyQPAdaptationSlice() below
342 | 1.29k | int sliceChromaQpOffsetIntraOrPeriodic[2] = { m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[0], m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[1] };
343 | 1.29k | const int lookAheadRCCQpOffset = 0; // was (m_pcEncCfg->m_RCTargetBitrate > 0 && m_pcEncCfg->m_LookAhead && CS::isDualITree (*slice->pic->cs) ? 1 : 0);
344 | 1.29k | int cbQP = 0, crQP = 0, cbCrQP = 0;
345 | |
346 | 1.29k | if (m_pcEncCfg->m_usePerceptQPA) // adapt sliceChromaQpOffsetIntraOrPeriodic and pic->ctuAdaptedQP
347 | 1.29k | {
// cqp: intra slice without IBC, or POC is a multiple of the configured chroma-offset periodicity
348 | 1.29k | const bool cqp = (slice->isIntra() && !slice->sps->IBC) || (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity > 0 && (slice->poc % m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity) == 0);
349 | 1.29k | const uint32_t startCtuTsAddr = slice->sliceMap.ctuAddrInSlice[0];
350 | 1.29k | const uint32_t boundingCtuTsAddr = slice->pic->cs->pcv->sizeInCtus;
351 | |
// a non-negative return value is taken as the adapted slice QP; a negative one falls back to dQP
352 | 1.29k | if ((iQP = BitAllocation::applyQPAdaptationSlice (slice, m_pcEncCfg, iQP, dLambda, &slice->pic->picVA.visAct, // updates pic->picInitialQP
353 | 1.29k | *m_ThreadRsrc[0]->m_encCu.getQpPtr(), m_pcRateCtrl->getIntraPQPAStats(),
354 | 1.29k | (slice->pps->sliceChromaQpFlag && cqp ? sliceChromaQpOffsetIntraOrPeriodic : nullptr),
355 | 1.29k | m_pcRateCtrl->getMinNoiseLevels(), startCtuTsAddr, boundingCtuTsAddr)) >= 0) // QP OK?
356 | 1.29k | {
357 | 1.29k | dLambda *= pow (2.0, ((double) iQP - dQP) / 3.0); // adjust lambda based on change of slice QP
358 | 1.29k | }
359 | 0 | else iQP = (int) dQP; // revert to unadapted slice QP
360 | 1.29k | }
361 | 0 | else if (rcp)
362 | 0 | {
363 | 0 | slice->pic->picInitialQP = -1; // no QPA - unused now
364 | 0 | }
365 | |
// chroma QP offsets: fixed dual-tree offsets (only without QPA and without periodicity),
// otherwise intra/periodic offsets or the GOP entry's offsets
366 | 1.29k | if (slice->pps->sliceChromaQpFlag && CS::isDualITree (*slice->pic->cs) && !m_pcEncCfg->m_usePerceptQPA && (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity == 0))
367 | 0 | {
368 | 0 | cbQP = m_pcEncCfg->m_chromaCbQpOffsetDualTree + lookAheadRCCQpOffset; // QP offset for dual-tree
369 | 0 | crQP = m_pcEncCfg->m_chromaCrQpOffsetDualTree + lookAheadRCCQpOffset;
370 | 0 | cbCrQP = m_pcEncCfg->m_chromaCbCrQpOffsetDualTree + lookAheadRCCQpOffset;
371 | 0 | }
372 | 1.29k | else if (slice->pps->sliceChromaQpFlag)
373 | 1.29k | {
374 | 1.29k | const GOPEntry &gopEntry = *(slice->pic->gopEntry);
// same condition as `cqp` in the QPA branch above
375 | 1.29k | const bool bUseIntraOrPeriodicOffset = (slice->isIntra() && !slice->sps->IBC) || (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity > 0 && (slice->poc % m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity) == 0);
376 | |
377 | 1.29k | cbQP = (bUseIntraOrPeriodicOffset ? sliceChromaQpOffsetIntraOrPeriodic[0] : gopEntry.m_CbQPoffset) + lookAheadRCCQpOffset;
378 | 1.29k | crQP = (bUseIntraOrPeriodicOffset ? sliceChromaQpOffsetIntraOrPeriodic[1] : gopEntry.m_CrQPoffset) + lookAheadRCCQpOffset;
379 | 1.29k | cbCrQP = (cbQP + crQP) >> 1; // use floor of average CbCr chroma QP offset for joint-CbCr coding
380 | |
// keep (PPS offset + slice offset) within [-12, 12] and store only the slice part
381 | 1.29k | cbQP = Clip3 (-12, 12, cbQP + slice->pps->chromaQpOffset[COMP_Cb]) - slice->pps->chromaQpOffset[COMP_Cb];
382 | 1.29k | crQP = Clip3 (-12, 12, crQP + slice->pps->chromaQpOffset[COMP_Cr]) - slice->pps->chromaQpOffset[COMP_Cr];
383 | 1.29k | cbCrQP = Clip3 (-12, 12, cbCrQP + slice->pps->chromaQpOffset[COMP_JOINT_CbCr]) - slice->pps->chromaQpOffset[COMP_JOINT_CbCr];
384 | 1.29k | }
385 | |
386 | 1.29k | slice->sliceChromaQpDelta[COMP_Cb] = Clip3 (-12, 12, cbQP);
387 | 1.29k | slice->sliceChromaQpDelta[COMP_Cr] = Clip3 (-12, 12, crQP);
388 | 1.29k | slice->sliceChromaQpDelta[COMP_JOINT_CbCr] = (slice->sps->jointCbCr ? Clip3 (-12, 12, cbCrQP) : 0);
389 | |
// every per-thread CU encoder gets the same slice lambda and QP
390 | 1.29k | for( auto& thrRsc : m_ThreadRsrc )
391 | 5.19k | {
392 | 5.19k | thrRsc->m_encCu.setUpLambda( *slice, dLambda, iQP, true, true );
393 | 5.19k | }
394 | |
395 | 1.29k | slice->sliceQp = iQP;
396 | 1.29k | slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0;
397 | 1.29k | }
398 | | |
399 | | static const int highTL[6] = { -1, 0, 0, 2, 4, 5 }; |
400 | | |
401 | | int EncSlice::xGetQPForPicture( const Slice* slice ) |
402 | 1.29k | { |
403 | 1.29k | const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ]; |
404 | 1.29k | int qp; |
405 | | |
406 | 1.29k | if ( m_pcEncCfg->m_costMode == VVENC_COST_LOSSLESS_CODING ) |
407 | 0 | { |
408 | 0 | qp = LOSSLESS_AND_MIXED_LOSSLESS_RD_COST_TEST_QP; |
409 | 0 | } |
410 | 1.29k | else |
411 | 1.29k | { |
412 | 1.29k | qp = m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP; |
413 | | |
414 | 1.29k | if (m_pcEncCfg->m_usePerceptQPA) |
415 | 1.29k | { |
416 | 1.29k | const int tlayer = slice->pic->gopEntry->m_vtl; |
417 | | |
418 | 1.29k | qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[tlayer] + ((qp * (16 + std::min (2, tlayer))) >> 4) + 0/*TODO +-1?*/); |
419 | 1.29k | } |
420 | 0 | else if( slice->isIntra() ) |
421 | 0 | { |
422 | 0 | qp += m_pcEncCfg->m_intraQPOffset; |
423 | 0 | } |
424 | 0 | else |
425 | 0 | { |
426 | 0 | if( qp != -lumaQpBDOffset ) |
427 | 0 | { |
428 | 0 | const GOPEntry &gopEntry = *(slice->pic->gopEntry); |
429 | | // adjust QP according to the QP offset for the GOP entry. |
430 | 0 | qp += gopEntry.m_QPOffset; |
431 | | |
432 | | // adjust QP according to QPOffsetModel for the GOP entry. |
433 | 0 | double dqpOffset = qp * gopEntry.m_QPOffsetModelScale + gopEntry.m_QPOffsetModelOffset + 0.5; |
434 | 0 | int qpOffset = (int)floor( Clip3<double>( 0.0, 3.0, dqpOffset ) ); |
435 | 0 | qp += qpOffset; |
436 | 0 | } |
437 | 0 | } |
438 | | |
439 | 1.29k | if( m_pcEncCfg->m_blockImportanceMapping && !slice->pic->m_picShared->m_ctuBimQpOffset.empty() ) |
440 | 0 | { |
441 | 0 | qp += slice->pic->m_picShared->m_picAuxQpOffset; |
442 | 0 | } |
443 | 1.29k | } |
444 | 1.29k | qp = Clip3( -lumaQpBDOffset, MAX_QP, qp ); |
445 | 1.29k | return qp; |
446 | 1.29k | } |
447 | | |
448 | | |
// Computes the slice lambda from the QP and returns it; `iQP` receives dQP rounded to the
// nearest integer and clipped to [-qpBDOffset[CH_L], MAX_QP].
// lambda = QP factor * 2^((dQP + bit-depth scale - 12) / 3), then scaled in this order by:
// the depth-dependent factor (only without lambdaFromQP and depth > 0), 0.95 when Hadamard ME
// is off on non-intra slices, the lambda modifier, and the dependent-quantization factor.
// `refQP` and `depth` are only used by the depth-dependent factor.
449 | | double EncSlice::xCalculateLambda( const Slice* slice,
450 | | const int depth, // slice GOP hierarchical depth.
451 | | const double refQP, // initial slice-level QP
452 | | const double dQP, // initial double-precision QP
453 | | int& iQP ) // returned integer QP.
454 | 1.29k | {
455 | 1.29k | const GOPEntry &gopEntry = *(slice->pic->gopEntry);
456 | 1.29k | const int SHIFT_QP = 12;
457 | 1.29k | const int temporalId = gopEntry.m_temporalId;
// collect the leading non-zero intra lambda modifiers; the first 0.0 entry ends the list
458 | 1.29k | std::vector<double> intraLambdaModifiers;
459 | 1.29k | for ( int i = 0; i < VVENC_MAX_TLAYER; i++ )
460 | 1.29k | {
461 | 1.29k | if( m_pcEncCfg->m_adIntraLambdaModifier[i] != 0.0 ) intraLambdaModifiers.push_back( m_pcEncCfg->m_adIntraLambdaModifier[i] );
462 | 1.29k | else break;
463 | 1.29k | }
464 | |
465 | 1.29k | int bitdepth_luma_qp_scale = 6
466 | 1.29k | * (slice->sps->bitDepths[ CH_L ] - 8
467 | 1.29k | - DISTORTION_PRECISION_ADJUSTMENT(slice->sps->bitDepths[ CH_L ]));
468 | 1.29k | double qp_temp = dQP + bitdepth_luma_qp_scale - SHIFT_QP;
469 | | // Case #1: I or P-slices (key-frame)
// QP factor: the GOP entry's value by default; I slices use m_dIntraQpFactor when it is >= 0 and
// the GOP entry is not typed 'I', otherwise 0.57 (reduced by the GOP size without lambdaFromQP)
470 | 1.29k | double dQPFactor = gopEntry.m_QPFactor;
471 | 1.29k | if( slice->sliceType == VVENC_I_SLICE )
472 | 1.29k | {
473 | 1.29k | if (m_pcEncCfg->m_dIntraQpFactor>=0.0 && gopEntry.m_sliceType != 'I')
474 | 0 | {
475 | 0 | dQPFactor = m_pcEncCfg->m_dIntraQpFactor;
476 | 0 | }
477 | 1.29k | else
478 | 1.29k | {
479 | 1.29k | dQPFactor = 0.57;
480 | 1.29k | if( ! m_pcEncCfg->m_lambdaFromQPEnable )
481 | 0 | {
482 | 0 | const int NumberBFrames = ( m_pcEncCfg->m_GOPSize - 1 );
483 | 0 | const double dLambda_scale = 1.0 - Clip3( 0.0, 0.5, 0.05 * (double)NumberBFrames );
484 | 0 | dQPFactor *= dLambda_scale;
485 | 0 | }
486 | 1.29k | }
487 | 1.29k | }
488 | 0 | else if( m_pcEncCfg->m_lambdaFromQPEnable )
489 | 0 | {
490 | 0 | dQPFactor=0.57;
491 | 0 | }
492 | |
493 | 1.29k | double dLambda = dQPFactor*pow( 2.0, qp_temp/3.0 );
494 | |
495 | 1.29k | if( !(m_pcEncCfg->m_lambdaFromQPEnable) && depth>0 )
496 | 0 | {
497 | 0 | double qp_temp_ref = refQP + bitdepth_luma_qp_scale - SHIFT_QP;
498 | 0 | dLambda *= Clip3(2.00, 4.00, (qp_temp_ref / 6.0)); // (j == B_SLICE && p_cur_frm->layer != 0 )
499 | 0 | }
500 | |
501 | | // if hadamard is used in ME process
502 | 1.29k | if ( !m_pcEncCfg->m_bUseHADME && slice->sliceType != VVENC_I_SLICE )
503 | 0 | {
504 | 0 | dLambda *= 0.95;
505 | 0 | }
506 | |
// non-intra slices, or no intra modifiers configured: general per-temporal-id modifier;
// otherwise the intra modifier for temporalId, falling back to the last collected one
507 | 1.29k | double lambdaModifier;
508 | 1.29k | if( slice->sliceType != VVENC_I_SLICE || intraLambdaModifiers.empty())
509 | 1.29k | {
510 | 1.29k | lambdaModifier = m_pcEncCfg->m_adLambdaModifier[ temporalId ];
511 | 1.29k | }
512 | 0 | else
513 | 0 | {
514 | 0 | lambdaModifier = intraLambdaModifiers[ (temporalId < intraLambdaModifiers.size()) ? temporalId : (intraLambdaModifiers.size()-1) ];
515 | 0 | }
516 | 1.29k | dLambda *= lambdaModifier;
517 | |
// output integer QP: dQP rounded to nearest, clipped to the valid QP range
518 | 1.29k | iQP = Clip3( -slice->sps->qpBDOffset[ CH_L ], MAX_QP, (int) floor( dQP + 0.5 ) );
519 | |
520 | 1.29k | if( m_pcEncCfg->m_DepQuantEnabled )
521 | 1.29k | {
522 | 1.29k | dLambda *= pow( 2.0, 0.25/3.0 ); // slight lambda adjustment for dependent quantization (due to different slope of quantizer)
523 | 1.29k | }
524 | |
525 | | // NOTE: the lambda modifiers that are sometimes applied later might be best always applied in here.
526 | 1.29k | return dLambda;
527 | 1.29k | }
528 | | |
529 | | |
530 | | // ==================================================================================================================== |
531 | | // Public member functions |
532 | | // ==================================================================================================================== |
533 | | |
534 | | |
535 | | /** \param pic picture class |
536 | | */ |
537 | | void EncSlice::compressSlice( Picture* pic ) |
538 | 1.29k | { |
539 | 1.29k | PROFILER_SCOPE_AND_STAGE( 1, g_timeProfiler, P_COMPRESS_SLICE ); |
540 | 1.29k | CodingStructure& cs = *pic->cs; |
541 | 1.29k | Slice* const slice = cs.slice; |
542 | 1.29k | uint32_t startCtuTsAddr = slice->sliceMap.ctuAddrInSlice[0]; |
543 | 1.29k | uint32_t boundingCtuTsAddr = pic->cs->pcv->sizeInCtus; |
544 | | |
545 | 1.29k | cs.pcv = slice->pps->pcv; |
546 | 1.29k | cs.fracBits = 0; |
547 | | |
548 | 1.29k | if( startCtuTsAddr == 0 ) |
549 | 1.29k | { |
550 | 1.29k | cs.initStructData( slice->sliceQp ); |
551 | 1.29k | } |
552 | | |
553 | 1.29k | for( auto* thrRsrc : m_ThreadRsrc ) |
554 | 5.19k | { |
555 | 5.19k | thrRsrc->m_encCu.initSlice( slice ); |
556 | 5.19k | } |
557 | | |
558 | 1.29k | for( auto* lnRsrc : m_TileLineEncRsrc ) |
559 | 2.26k | { |
560 | 2.26k | lnRsrc->m_CABACEstimator .initCtxModels( *slice ); |
561 | 2.26k | lnRsrc->m_SaoCABACEstimator .initCtxModels( *slice ); |
562 | 2.26k | lnRsrc->m_AlfCABACEstimator .initCtxModels( *slice ); |
563 | 2.26k | lnRsrc->m_AffineProfList .resetAffineMVList(); |
564 | 2.26k | lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList(); |
565 | 2.26k | lnRsrc->m_CachedBvs .resetIbcBvCand(); |
566 | | |
567 | 2.26k | if( slice->sps->saoEnabled && pic->useSAO ) |
568 | 2.26k | { |
569 | 2.26k | lnRsrc->m_encSao .initSlice( slice ); |
570 | 2.26k | } |
571 | 2.26k | } |
572 | | |
573 | 1.29k | if( slice->sps->fpelMmvd && !slice->picHeader->disFracMMVD ) |
574 | 1.29k | { |
575 | 1.29k | slice->picHeader->disFracMMVD = ( pic->lwidth() * pic->lheight() > 1920 * 1080 ) ? true : false; |
576 | 1.29k | } |
577 | | |
578 | 1.29k | xProcessCtus( pic, startCtuTsAddr, boundingCtuTsAddr ); |
579 | 1.29k | } |
580 | | |
581 | | void setJointCbCrModes( CodingStructure& cs, const Position topLeftLuma, const Size sizeLuma ) |
582 | 1.29k | { |
583 | 1.29k | bool sgnFlag = true; |
584 | | |
585 | 1.29k | if( isChromaEnabled( cs.picture->chromaFormat) ) |
586 | 1.29k | { |
587 | 1.29k | const CompArea cbArea = CompArea( COMP_Cb, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true ); |
588 | 1.29k | const CompArea crArea = CompArea( COMP_Cr, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true ); |
589 | | |
590 | 1.29k | const CPelBuf orgCb = cs.picture->getFilteredOrigBuffer().valid() ? cs.picture->getRspOrigBuf( cbArea ): cs.picture->getOrigBuf( cbArea ); |
591 | 1.29k | const CPelBuf orgCr = cs.picture->getFilteredOrigBuffer().valid() ? cs.picture->getRspOrigBuf( crArea ): cs.picture->getOrigBuf( crArea ); |
592 | 1.29k | const int x0 = ( cbArea.x > 0 ? 0 : 1 ); |
593 | 1.29k | const int y0 = ( cbArea.y > 0 ? 0 : 1 ); |
594 | 1.29k | const int x1 = ( cbArea.x + cbArea.width < cs.picture->Cb().width ? cbArea.width : cbArea.width - 1 ); |
595 | 1.29k | const int y1 = ( cbArea.y + cbArea.height < cs.picture->Cb().height ? cbArea.height : cbArea.height - 1 ); |
596 | 1.29k | const int cbs = orgCb.stride; |
597 | 1.29k | const int crs = orgCr.stride; |
598 | 1.29k | const Pel* pCb = orgCb.buf + y0 * cbs; |
599 | 1.29k | const Pel* pCr = orgCr.buf + y0 * crs; |
600 | 1.29k | int64_t sumCbCr = 0; |
601 | | |
602 | | // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes |
603 | 97.4k | for( int y = y0; y < y1; y++, pCb += cbs, pCr += crs ) |
604 | 96.1k | { |
605 | 7.68M | for( int x = x0; x < x1; x++ ) |
606 | 7.59M | { |
607 | 7.59M | int cb = ( 12*(int)pCb[x] - 2*((int)pCb[x-1] + (int)pCb[x+1] + (int)pCb[x-cbs] + (int)pCb[x+cbs]) - ((int)pCb[x-1-cbs] + (int)pCb[x+1-cbs] + (int)pCb[x-1+cbs] + (int)pCb[x+1+cbs]) ); |
608 | 7.59M | int cr = ( 12*(int)pCr[x] - 2*((int)pCr[x-1] + (int)pCr[x+1] + (int)pCr[x-crs] + (int)pCr[x+crs]) - ((int)pCr[x-1-crs] + (int)pCr[x+1-crs] + (int)pCr[x-1+crs] + (int)pCr[x+1+crs]) ); |
609 | 7.59M | sumCbCr += cb*cr; |
610 | 7.59M | } |
611 | 96.1k | } |
612 | | |
613 | 1.29k | sgnFlag = ( sumCbCr < 0 ); |
614 | 1.29k | } |
615 | | |
616 | 1.29k | cs.slice->picHeader->jointCbCrSign = sgnFlag; |
617 | 1.29k | } |
618 | | |
// Immutable CTU position: column and row in CTU units plus the raster-scan address.
struct CtuPos
{
  const int ctuPosX;
  const int ctuPosY;
  const int ctuRsAddr;

  CtuPos( int posX, int posY, int rsAddr )
    : ctuPosX  { posX }
    , ctuPosY  { posY }
    , ctuRsAddr{ rsAddr }
  {
  }
};
627 | | |
628 | | class CtuTsIterator |
629 | | { |
630 | | private: |
631 | | const CodingStructure& cs; |
632 | | const int m_startTsAddr; |
633 | | const int m_endTsAddr; |
634 | | std::vector<int> m_ctuAddrMap; |
635 | | int m_ctuTsAddr; |
636 | | |
637 | | private: |
638 | | int getNextTsAddr( const int _tsAddr ) const |
639 | 4.05k | { |
640 | 4.05k | const PreCalcValues& pcv = *cs.pcv; |
641 | 4.05k | const int startSliceRsRow = m_startTsAddr / pcv.widthInCtus; |
642 | 4.05k | const int startSliceRsCol = m_startTsAddr % pcv.widthInCtus; |
643 | 4.05k | const int endSliceRsRow = (m_endTsAddr - 1) / pcv.widthInCtus; |
644 | 4.05k | const int endSliceRsCol = (m_endTsAddr - 1) % pcv.widthInCtus; |
645 | 4.05k | int ctuTsAddr = _tsAddr; |
646 | 4.05k | CHECK( ctuTsAddr > m_endTsAddr, "error: array index out of bounds" ); |
647 | 5.34k | while( ctuTsAddr < m_endTsAddr ) |
648 | 4.05k | { |
649 | 4.05k | ctuTsAddr++; |
650 | 4.05k | const int ctuRsAddr = ctuTsAddr; |
651 | 4.05k | if( cs.slice->pps->rectSlice |
652 | 4.05k | && ( (ctuRsAddr / pcv.widthInCtus) < startSliceRsRow |
653 | 4.05k | || (ctuRsAddr / pcv.widthInCtus) > endSliceRsRow |
654 | 2.75k | || (ctuRsAddr % pcv.widthInCtus) < startSliceRsCol |
655 | 2.75k | || (ctuRsAddr % pcv.widthInCtus) > endSliceRsCol ) ) |
656 | 1.29k | continue; |
657 | 2.75k | break; |
658 | 4.05k | } |
659 | 4.05k | return ctuTsAddr; |
660 | 4.05k | } |
661 | | |
662 | | int mapAddr( const int _addr ) const |
663 | 4.05k | { |
664 | 4.05k | if( _addr < 0 ) |
665 | 0 | return _addr; |
666 | 4.05k | if( _addr >= m_ctuAddrMap.size() ) |
667 | 0 | return _addr; |
668 | 4.05k | return m_ctuAddrMap[ _addr ]; |
669 | 4.05k | } |
670 | | |
671 | | public: |
672 | 1.29k | CtuTsIterator( const CodingStructure& _cs, int _s, int _e, std::vector<int>& _m ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {} |
673 | 0 | CtuTsIterator( const CodingStructure& _cs, int _s, int _e, bool _wpp ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuTsAddr( _s ) { if( _wpp ) setWppPattern(); } |
674 | 0 | CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {} |
675 | 1.29k | CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m, int _c ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( std::max( _s, _c ) ) {} |
676 | 1.29k | CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>* _m, bool _wpp ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuTsAddr( _s ) { if( _wpp ) m_ctuAddrMap = *_m; } |
677 | | |
678 | 9.24k | virtual ~CtuTsIterator() { m_ctuAddrMap.clear(); } |
679 | | |
680 | 4.05k | CtuTsIterator& operator++() { m_ctuTsAddr = getNextTsAddr( m_ctuTsAddr ); return *this; } |
681 | 0 | CtuTsIterator operator++(int) { auto retval = *this; ++(*this); return retval; } |
682 | 0 | bool operator==(CtuTsIterator other) const { return m_ctuTsAddr == other.m_ctuTsAddr; } |
683 | 5.34k | bool operator!=(CtuTsIterator other) const { return m_ctuTsAddr != other.m_ctuTsAddr; } |
684 | 4.05k | CtuPos operator*() const { const int ctuRsAddr = mapAddr( m_ctuTsAddr ); return CtuPos( ctuRsAddr % cs.pcv->widthInCtus, ctuRsAddr / cs.pcv->widthInCtus, ctuRsAddr ); } |
685 | | |
686 | 1.29k | CtuTsIterator begin() { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap ); }; |
687 | 1.29k | CtuTsIterator end() { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap, m_endTsAddr ); }; |
688 | | |
689 | | using iterator_category = std::forward_iterator_tag; |
690 | | using value_type = int; |
691 | | using pointer = int*; |
692 | | using reference = int&; |
693 | | using difference_type = ptrdiff_t; |
694 | | |
695 | | void setWppPattern() |
696 | 0 | { |
697 | 0 | const PreCalcValues& pcv = *cs.pcv; |
698 | 0 | m_ctuAddrMap.resize( pcv.sizeInCtus, 0 ); |
699 | 0 | int addr = 0; |
700 | 0 | for( int i = 1; i < pcv.sizeInCtus; i++ ) |
701 | 0 | { |
702 | 0 | int x = addr % pcv.widthInCtus; |
703 | 0 | int y = addr / pcv.widthInCtus; |
704 | 0 | x -= 1; |
705 | 0 | y += 1; |
706 | 0 | if( x < 0 || y >= pcv.heightInCtus ) |
707 | 0 | { |
708 | 0 | x += 1 + y; |
709 | 0 | y = 0; |
710 | 0 | } |
711 | 0 | if( x >= pcv.widthInCtus ) |
712 | 0 | { |
713 | 0 | y += ( x - pcv.widthInCtus ) + 1; |
714 | 0 | x = pcv.widthInCtus - 1; |
715 | 0 | } |
716 | 0 | addr = y * pcv.widthInCtus + x; |
717 | 0 | m_ctuAddrMap[ i ] = addr; |
718 | 0 | } |
719 | 0 | } |
720 | | }; |
721 | | |
722 | | void EncSlice::saoDisabledRate( CodingStructure& cs, SAOBlkParam* reconParams ) |
723 | 0 | { |
724 | 0 | EncSampleAdaptiveOffset::disabledRate( cs, m_saoDisabledRate, reconParams, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat ); |
725 | 0 | } |
726 | | |
727 | | void EncSlice::finishCompressSlice( Picture* pic, Slice& slice ) |
728 | 1.29k | { |
729 | 1.29k | CodingStructure& cs = *pic->cs; |
730 | | |
731 | | // finalize |
732 | 1.29k | if( slice.sps->saoEnabled && pic->useSAO ) |
733 | 1.29k | { |
734 | | // store disabled statistics |
735 | 1.29k | if( !m_pcEncCfg->m_numThreads ) |
736 | 0 | saoDisabledRate( cs, &m_saoReconParams[ 0 ] ); |
737 | | |
738 | | // set slice header flags |
739 | 1.29k | CHECK( m_saoEnabled[ COMP_Cb ] != m_saoEnabled[ COMP_Cr ], "Unspecified error"); |
740 | 1.29k | for( auto s : pic->slices ) |
741 | 1.29k | { |
742 | 1.29k | s->saoEnabled[ CH_L ] = m_saoEnabled[ COMP_Y ]; |
743 | 1.29k | s->saoEnabled[ CH_C ] = m_saoEnabled[ COMP_Cb ]; |
744 | 1.29k | } |
745 | 1.29k | } |
746 | 1.29k | } |
747 | | |
748 | | void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr ) |
749 | 1.29k | { |
750 | 1.29k | PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs ); |
751 | 1.29k | CodingStructure& cs = *pic->cs; |
752 | 1.29k | Slice& slice = *cs.slice; |
753 | 1.29k | const PreCalcValues& pcv = *cs.pcv; |
754 | | |
755 | | // initialization |
756 | 1.29k | if( slice.sps->jointCbCr ) |
757 | 1.29k | { |
758 | 1.29k | setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() ); |
759 | 1.29k | } |
760 | | |
761 | 1.29k | if( slice.sps->saoEnabled && pic->useSAO ) |
762 | 1.29k | { |
763 | | // check SAO enabled or disabled |
764 | 1.29k | EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat ); |
765 | | |
766 | 1.29k | m_saoAllDisabled = true; |
767 | 5.19k | for( int compIdx = 0; compIdx < getNumberValidComponents( pcv.chrFormat ); compIdx++ ) |
768 | 3.89k | { |
769 | 3.89k | m_saoAllDisabled &= ! m_saoEnabled[ compIdx ]; |
770 | 3.89k | } |
771 | | |
772 | 1.29k | std::fill( m_saoReconParams.begin(), m_saoReconParams.end(), SAOBlkParam() ); |
773 | 1.29k | } |
774 | 0 | else |
775 | 0 | { |
776 | 0 | m_saoAllDisabled = true; |
777 | 0 | } |
778 | | |
779 | 1.29k | if( slice.sps->alfEnabled ) |
780 | 1.29k | { |
781 | 1.29k | m_pALF->initEncProcess( slice ); |
782 | 1.29k | } |
783 | | |
784 | 1.29k | std::fill( m_processStates.begin(), m_processStates.end(), CTU_ENCODE ); |
785 | | |
786 | | // fill encoder parameter list |
787 | 1.29k | int idx = 0; |
788 | 1.29k | const std::vector<int> base = slice.sliceMap.ctuAddrInSlice; |
789 | 1.29k | auto ctuIter = CtuTsIterator( cs, startCtuTsAddr, boundingCtuTsAddr, &m_ctuAddrMap, m_pcEncCfg->m_numThreads > 0 ); |
790 | 1.29k | for( auto ctuPos : ctuIter ) |
791 | 4.05k | { |
792 | 4.05k | ctuEncParams[ idx ].pic = pic; |
793 | 4.05k | ctuEncParams[ idx ].encSlice = this; |
794 | 4.05k | ctuEncParams[ idx ].ctuRsAddr = ctuPos.ctuRsAddr; |
795 | 4.05k | ctuEncParams[ idx ].ctuPosX = ctuPos.ctuPosX; |
796 | 4.05k | ctuEncParams[ idx ].ctuPosY = ctuPos.ctuPosY; |
797 | 4.05k | ctuEncParams[ idx ].ctuArea = UnitArea( pic->chromaFormat, slice.pps->pcv->getCtuArea( ctuPos.ctuPosX, ctuPos.ctuPosY ) ); |
798 | | |
799 | 4.05k | if( m_pcEncCfg->m_numThreads > 0 ) |
800 | 4.05k | { |
801 | 4.05k | ctuEncParams[idx].tileLineResIdx = slice.pps->getTileLineId( ctuPos.ctuPosX, ctuPos.ctuPosY ); |
802 | 4.05k | } |
803 | 0 | else |
804 | 0 | { |
805 | 0 | ctuEncParams[idx].tileLineResIdx = 0; |
806 | 0 | } |
807 | 4.05k | idx++; |
808 | 4.05k | } |
809 | | |
810 | | //for( int i = 0; i < idx; i++ ) |
811 | | //{ |
812 | | // for( int j = i; j < idx; j++ ) |
813 | | // { |
814 | | // if( ctuEncParams[i].tileLineResIdx != ctuEncParams[j].tileLineResIdx ) continue; |
815 | | // |
816 | | // CHECK( ctuEncParams[i].ctuPosY != ctuEncParams[j].ctuPosY, "Not the same CTU line!" ); |
817 | | // CHECK( slice.pps->getTileIdx( ctuEncParams[i].ctuPosX, ctuEncParams[i].ctuPosY ) != slice.pps->getTileIdx( ctuEncParams[j].ctuPosX, ctuEncParams[j].ctuPosY ), "Not the same tile!" ); |
818 | | // } |
819 | | //} |
820 | | |
821 | 1.29k | CHECK( idx != pcv.sizeInCtus, "array index out of bounds" ); |
822 | | |
823 | | // process ctu's until last ctu is done |
824 | 1.29k | if( m_pcEncCfg->m_numThreads > 0 ) |
825 | 1.29k | { |
826 | 1.29k | for( auto& ctuEncParam : ctuEncParams ) |
827 | 4.05k | { |
828 | 4.05k | m_threadPool->addBarrierTask( EncSlice::xProcessCtuTask<false>, |
829 | 4.05k | &ctuEncParam, |
830 | 4.05k | m_ctuTasksDoneCounter, |
831 | 4.05k | nullptr, |
832 | 4.05k | {}, |
833 | 4.05k | EncSlice::xProcessCtuTask<true> ); |
834 | 4.05k | } |
835 | 1.29k | } |
836 | 0 | else |
837 | 0 | { |
838 | 0 | do |
839 | 0 | { |
840 | 0 | for( auto& ctuEncParam : ctuEncParams ) |
841 | 0 | { |
842 | 0 | if( m_processStates[ctuEncParam.ctuRsAddr] != PROCESS_DONE ) |
843 | 0 | EncSlice::xProcessCtuTask<false>( 0, &ctuEncParam ); |
844 | 0 | } |
845 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_LUMA_LF, cs, cs.getRecoBuf(), COMP_Y ); |
846 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cb ); |
847 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cr ); |
848 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_LUMA_SAO, cs, cs.getRecoBuf(), COMP_Y ); |
849 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cb ); |
850 | 0 | DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cr ); |
851 | 0 | } |
852 | 0 | while( m_processStates[ boundingCtuTsAddr - 1 ] != PROCESS_DONE ); |
853 | 0 | } |
854 | 1.29k | } |
855 | | |
856 | | inline bool checkCtuTaskNbTop( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false ) |
857 | 835k | { |
858 | 835k | return ctuPosY > 0 && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus ] <= tskType; |
859 | 835k | } |
860 | | |
861 | | inline bool checkCtuTaskNbBot( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false ) |
862 | 313k | { |
863 | 313k | return ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, 1 ) ) && processStates[ ctuRsAddr + pps.pcv->widthInCtus ] <= tskType; |
864 | 313k | } |
865 | | |
866 | | inline bool checkCtuTaskNbRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false ) |
867 | 665k | { |
868 | 665k | return ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, 0 ) ) && processStates[ ctuRsAddr + 1 ] <= tskType; |
869 | 665k | } |
870 | | |
871 | | inline bool checkCtuTaskNbTopRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false ) |
872 | 259k | { |
873 | 259k | return ctuPosY > 0 && ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus + 1 ] <= tskType; |
874 | 259k | } |
875 | | |
876 | | inline bool checkCtuTaskNbBotRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, const int rightOffset = 1, bool override = false ) |
877 | 8.97M | { |
878 | 8.97M | return ctuPosX + rightOffset < pps.pcv->widthInCtus && ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, rightOffset, 1 ) ) && processStates[ ctuRsAddr + rightOffset + pps.pcv->widthInCtus ] <= tskType; |
879 | 8.97M | } |
880 | | |
881 | | template<bool checkReadyState> |
882 | | bool EncSlice::xProcessCtuTask( int threadIdx, void* taskParam ) |
883 | 131M | { |
884 | 131M | CtuEncParam* ctuEncParam = static_cast<CtuEncParam*>( taskParam ); |
885 | 131M | Picture* pic = ctuEncParam->pic; |
886 | 131M | EncSlice* encSlice = ctuEncParam->encSlice; |
887 | 131M | CodingStructure& cs = *pic->cs; |
888 | 131M | Slice& slice = *cs.slice; |
889 | 131M | const PPS& pps = *slice.pps; |
890 | 131M | const PreCalcValues& pcv = *cs.pcv; |
891 | 131M | const int ctuRsAddr = ctuEncParam->ctuRsAddr; |
892 | 131M | const int ctuPosX = ctuEncParam->ctuPosX; |
893 | 131M | const int ctuPosY = ctuEncParam->ctuPosY; |
894 | 131M | const int x = ctuPosX << pcv.maxCUSizeLog2; |
895 | 131M | const int y = ctuPosY << pcv.maxCUSizeLog2; |
896 | 131M | const int width = std::min( pcv.maxCUSize, pcv.lumaWidth - x ); |
897 | 131M | const int height = std::min( pcv.maxCUSize, pcv.lumaHeight - y ); |
898 | 131M | const int ctuStride = pcv.widthInCtus; |
899 | 131M | const int lineIdx = ctuEncParam->tileLineResIdx; |
900 | 131M | ProcessCtuState* processStates = encSlice->m_processStates.data(); |
901 | 131M | const UnitArea& ctuArea = ctuEncParam->ctuArea; |
902 | 131M | const bool wppSyncEnabled = cs.sps->entropyCodingSyncEnabled; |
903 | 131M | const TaskType currState = processStates[ ctuRsAddr ]; |
904 | 131M | const unsigned syncLines = encSlice->m_pcEncCfg->m_ifpLines; |
905 | | |
906 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) ); |
907 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); |
908 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) ); |
909 | | |
910 | | // process ctu's line wise from left to right |
911 | 131M | const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc; |
912 | 131M | if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ) |
913 | 0 | ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles |
914 | 131M | else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE ) |
915 | 71.8M | return false; |
916 | | |
917 | 59.2M | switch( currState ) |
918 | 59.2M | { |
919 | | // encode |
920 | 28.5M | case CTU_ENCODE: |
921 | 28.5M | { |
922 | | // CTU line-wise inter-frame parallel processing synchronization |
923 | 28.5M | if( syncLines ) |
924 | 0 | { |
925 | 0 | const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ); |
926 | 0 | if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) ) |
927 | 0 | { |
928 | 0 | return false; |
929 | 0 | } |
930 | 0 | } |
931 | | |
932 | | // general wpp conditions, top and top-right ctu have to be encoded |
933 | 28.5M | if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) ) |
934 | 0 | ; // allow parallel processing of CTU-encoding on independent tiles |
935 | 28.5M | else if( ctuPosY > 0 && processStates[ ctuRsAddr - ctuStride ] <= CTU_ENCODE ) |
936 | 23.3M | return false; |
937 | 5.17M | else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled ) |
938 | 5.16M | return false; |
939 | | |
940 | 8.07k | if( checkReadyState ) |
941 | 4.04k | return true; |
942 | | |
943 | | #ifdef TRACE_ENABLE_ITT |
944 | | std::stringstream ss; |
945 | | ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX; |
946 | | __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() ); |
947 | | #endif |
948 | 4.03k | ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode ); |
949 | | |
950 | 4.03k | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; |
951 | 4.03k | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; |
952 | 4.03k | EncCu& encCu = taskRsrc->m_encCu; |
953 | | |
954 | 4.03k | encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs ); |
955 | 4.03k | encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY ); |
956 | | |
957 | | // cleanup line memory when last ctu in line done to reduce overall memory consumption |
958 | 4.05k | if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) ) |
959 | 2.26k | { |
960 | 2.26k | lineEncRsrc->m_AffineProfList .resetAffineMVList(); |
961 | 2.26k | lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList(); |
962 | 2.26k | lineEncRsrc->m_ReuseUniMv .resetReusedUniMvs(); |
963 | 2.26k | lineEncRsrc->m_CachedBvs .resetIbcBvCand(); |
964 | 2.26k | } |
965 | | |
966 | 4.03k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) ); |
967 | 4.03k | ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode ); |
968 | | |
969 | 4.03k | processStates[ ctuRsAddr ] = RESHAPE_LF_VER; |
970 | 4.03k | } |
971 | 0 | break; |
972 | | |
973 | | // reshape + vertical loopfilter |
974 | 17.5M | case RESHAPE_LF_VER: |
975 | 17.5M | { |
976 | | // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC) |
977 | 17.5M | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; |
978 | 17.5M | const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; |
979 | 17.5M | const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX ); |
980 | | |
981 | 17.5M | const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1; |
982 | | |
983 | | // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles |
984 | 17.5M | if( hasTiles ) |
985 | 0 | { |
986 | 0 | if( ctuPosY > 0 ) |
987 | 0 | { |
988 | 0 | for( int i = -!!ctuPosX; i <= checkRight; i++ ) |
989 | 0 | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE ) |
990 | 0 | return false; |
991 | 0 | } |
992 | 0 | } |
993 | | |
994 | | // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too) |
995 | | // check right with max offset (due to WPP condition above, this implies top-right has been already encoded) |
996 | 26.4M | for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ ) |
997 | 17.5M | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE ) |
998 | 8.56M | return false; |
999 | | |
1000 | | // check bottom right with 1 CTU delay (this is only required for intra pred) |
1001 | | // at the right picture border this will check the bottom CTU |
1002 | 8.96M | const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX ); |
1003 | 8.96M | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) |
1004 | 8.95M | return false; |
1005 | | |
1006 | 8.02k | if( checkReadyState ) |
1007 | 4.04k | return true; |
1008 | | |
1009 | 3.97k | ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer ); |
1010 | | |
1011 | | // reshape |
1012 | 3.97k | if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled ) |
1013 | 0 | { |
1014 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L ); |
1015 | 0 | PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height ); |
1016 | 0 | reco.rspSignal( pic->reshapeData.getInvLUT() ); |
1017 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1018 | 0 | } |
1019 | | |
1020 | | // loopfilter |
1021 | 3.97k | if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) |
1022 | 4.04k | { |
1023 | 4.04k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); |
1024 | | // calculate filter strengths |
1025 | 4.04k | encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true ); |
1026 | | |
1027 | | // vertical filter |
1028 | 4.04k | PelUnitBuf reco = cs.picture->getRecoBuf(); |
1029 | 4.04k | encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco ); |
1030 | 4.04k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1031 | 4.04k | } |
1032 | | |
1033 | 3.97k | ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer ); |
1034 | | |
1035 | 3.97k | processStates[ ctuRsAddr ] = LF_HOR; |
1036 | 3.97k | } |
1037 | 0 | break; |
1038 | | |
1039 | | // horizontal loopfilter |
1040 | 507k | case LF_HOR: |
1041 | 507k | { |
1042 | | // ensure horizontal ordering (from top to bottom) |
1043 | 507k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) ) |
1044 | 168k | return false; |
1045 | | |
1046 | | // ensure vertical loop filter of neighbor ctu's will not modify current residual |
1047 | | // check top, top-right and right ctu |
1048 | | // (top, top-right checked implicitly due to ordering check above) |
1049 | 339k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) |
1050 | 331k | return false; |
1051 | | |
1052 | 8.05k | if( checkReadyState ) |
1053 | 4.05k | return true; |
1054 | | |
1055 | 4.00k | ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor ); |
1056 | | |
1057 | 4.00k | if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) |
1058 | 4.05k | { |
1059 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); |
1060 | 4.05k | PelUnitBuf reco = cs.picture->getRecoBuf(); |
1061 | 4.05k | encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco ); |
1062 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1063 | 4.05k | } |
1064 | | |
1065 | 4.00k | ITT_TASKEND( itt_domain_encode, itt_handle_lfHor ); |
1066 | | |
1067 | 4.00k | processStates[ ctuRsAddr ] = SAO_FILTER; |
1068 | 4.00k | } |
1069 | 0 | break; |
1070 | | |
1071 | | // SAO filter |
1072 | 318k | case SAO_FILTER: |
1073 | 318k | { |
1074 | | // general wpp conditions, top and top-right ctu have to be filtered |
1075 | 318k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; |
1076 | 259k | if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; |
1077 | | |
1078 | | // ensure loop filter of neighbor ctu's will not modify current residual |
1079 | | // sao processing dependents on +1 pixel to each side |
1080 | | // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked |
1081 | 234k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; |
1082 | 217k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; |
1083 | 11.3k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false; |
1084 | | |
1085 | 8.08k | if( checkReadyState ) |
1086 | 4.05k | return true; |
1087 | | |
1088 | 4.03k | ITT_TASKSTART( itt_domain_encode, itt_handle_sao ); |
1089 | | |
1090 | | // SAO filter |
1091 | 4.05k | if( slice.sps->saoEnabled && pic->useSAO ) |
1092 | 4.05k | { |
1093 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L ); |
1094 | 4.05k | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; |
1095 | 4.05k | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; |
1096 | 4.05k | EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao; |
1097 | | |
1098 | 4.05k | encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache ); |
1099 | 4.05k | encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY ); |
1100 | 4.05k | encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr ); |
1101 | 4.05k | encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() ); |
1102 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1103 | 4.05k | } |
1104 | | |
1105 | | // ALF border extension |
1106 | 4.03k | if( cs.sps->alfEnabled ) |
1107 | 4.05k | { |
1108 | | // we have to do some kind of position aware boundary padding |
1109 | | // it's done here because the conditions are readable |
1110 | 4.05k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); |
1111 | 4.05k | const int fltSize = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1; |
1112 | 4.05k | const int xL = ( ctuPosX == 0 ) ? ( x-fltSize ) : ( x ); |
1113 | 4.05k | const int xR = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width ); |
1114 | | |
1115 | 4.05k | if( ctuPosX == 0 ) recoBuf.extendBorderPelLft( y, height, fltSize ); |
1116 | 4.05k | if( ctuPosX+1 == pcv.widthInCtus ) recoBuf.extendBorderPelRgt( y, height, fltSize ); |
1117 | 4.05k | if( ctuPosY == 0 ) recoBuf.extendBorderPelTop( xL, xR-xL, fltSize ); |
1118 | 4.05k | if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize ); |
1119 | | |
1120 | 4.05k | encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY); |
1121 | 4.05k | } |
1122 | | |
1123 | | // DMVR refinement can be stored now |
1124 | 4.05k | if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag ) |
1125 | 4.05k | { |
1126 | 4.05k | CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY ); |
1127 | 4.05k | } |
1128 | 4.03k | ITT_TASKEND( itt_domain_encode, itt_handle_sao ); |
1129 | | |
1130 | 4.03k | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; |
1131 | 4.03k | const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; |
1132 | 4.03k | if( ctuPosX == lastCtuColInTileRow ) |
1133 | 2.26k | { |
1134 | 2.26k | processStates[ctuRsAddr] = ALF_GET_STATISTICS; |
1135 | 2.26k | } |
1136 | 1.77k | else |
1137 | 1.77k | { |
1138 | 1.77k | processStates[ctuRsAddr] = PROCESS_DONE; |
1139 | 1.77k | return true; |
1140 | 1.77k | } |
1141 | 4.03k | } |
1142 | 2.26k | break; |
1143 | | |
1144 | 90.7k | case ALF_GET_STATISTICS: |
1145 | 90.7k | { |
1146 | | // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's) |
1147 | | // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked |
1148 | 90.7k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; |
1149 | 90.7k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; |
1150 | 4.52k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; |
1151 | | |
1152 | 4.52k | if( checkReadyState ) |
1153 | 2.26k | return true; |
1154 | | |
1155 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat ); |
1156 | | |
1157 | | // ALF pre-processing |
1158 | 2.26k | if( slice.sps->alfEnabled ) |
1159 | 2.26k | { |
1160 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); |
1161 | 2.26k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); |
1162 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1163 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) |
1164 | 4.05k | { |
1165 | 4.05k | encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1166 | 4.05k | } |
1167 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1168 | 2.26k | } |
1169 | | |
1170 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat ); |
1171 | | |
1172 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) |
1173 | 2.26k | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; |
1174 | 2.26k | processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER; |
1175 | 2.26k | } |
1176 | 0 | break; |
1177 | | |
1178 | 1.98M | case ALF_DERIVE_FILTER: |
1179 | 1.98M | { |
1180 | 1.98M | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; |
1181 | 1.98M | if( ctuRsAddr == deriveFilterCtu ) |
1182 | 1.98M | { |
1183 | | // ensure statistics from all previous ctu's have been collected |
1184 | 1.98M | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; |
1185 | 2.00M | for( int y = 0; y < numCheckLines; y++ ) |
1186 | 2.00M | { |
1187 | 2.02M | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) |
1188 | 2.00M | { |
1189 | 2.00M | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; |
1190 | 2.00M | if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS ) |
1191 | 1.98M | return false; |
1192 | 2.00M | } |
1193 | 2.00M | } |
1194 | 1.98M | } |
1195 | 0 | else if( syncLines ) |
1196 | 0 | { |
1197 | | // ALF bitstream coding dependency for the sub-sequent ctu-lines |
1198 | 0 | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) |
1199 | 0 | return false; |
1200 | 0 | } |
1201 | 2.59k | if( checkReadyState ) |
1202 | 1.29k | return true; |
1203 | | |
1204 | 1.29k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive ); |
1205 | | // ALF post-processing |
1206 | 1.29k | if( slice.sps->alfEnabled ) |
1207 | 1.29k | { |
1208 | 1.29k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); |
1209 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) |
1210 | 1.29k | { |
1211 | 1.29k | encSlice->m_pALF->initDerivation( slice ); |
1212 | 1.29k | encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 ); |
1213 | 1.29k | encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false ); |
1214 | 1.29k | } |
1215 | 0 | else if( syncLines ) |
1216 | 0 | { |
1217 | | // in sync lines mode: derive/select filter for the remaining lines |
1218 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; |
1219 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; |
1220 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1221 | 0 | for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++) |
1222 | 0 | { |
1223 | 0 | encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu ); |
1224 | 0 | } |
1225 | 0 | } |
1226 | 1.29k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1227 | 1.29k | } |
1228 | | |
1229 | 1.29k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive ); |
1230 | 1.29k | processStates[ ctuRsAddr ] = ALF_RECONSTRUCT; |
1231 | 1.29k | } |
1232 | 0 | break; |
1233 | | |
1234 | 10.0M | case ALF_RECONSTRUCT: |
1235 | 10.0M | { |
1236 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) |
1237 | 10.0M | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; |
1238 | 10.0M | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT ) |
1239 | 10.0M | return false; |
1240 | 4.52k | else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 ) |
1241 | 0 | { |
1242 | 0 | const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus(); |
1243 | 0 | const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 ); |
1244 | 0 | if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) |
1245 | 0 | return false; |
1246 | 0 | } |
1247 | | |
1248 | 4.52k | if( checkReadyState ) |
1249 | 2.26k | return true; |
1250 | | |
1251 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon ); |
1252 | | |
1253 | 2.26k | if( slice.sps->alfEnabled ) |
1254 | 2.26k | { |
1255 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); |
1256 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1257 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) |
1258 | 4.05k | { |
1259 | 4.05k | encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1260 | 4.05k | } |
1261 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1262 | 2.26k | } |
1263 | | |
1264 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon ); |
1265 | 2.26k | processStates[ctuRsAddr] = CCALF_GET_STATISTICS; |
1266 | 2.26k | } |
1267 | | // dont break, no additional deps, can continue straigt away! |
1268 | | //break; |
1269 | | |
1270 | 8.82k | case CCALF_GET_STATISTICS: |
1271 | 8.82k | { |
1272 | 8.82k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; |
1273 | 4.95k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; |
1274 | | |
1275 | 3.08k | if( checkReadyState ) |
1276 | 824 | return true; |
1277 | | |
1278 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat ); |
1279 | | |
1280 | | // ALF pre-processing |
1281 | 2.26k | if( slice.sps->ccalfEnabled ) |
1282 | 2.26k | { |
1283 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L); |
1284 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1285 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) |
1286 | 4.05k | { |
1287 | 4.05k | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1288 | 4.05k | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1289 | 4.05k | } |
1290 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); |
1291 | 2.26k | } |
1292 | | |
1293 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat ); |
1294 | | |
1295 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) |
1296 | 2.26k | processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER; |
1297 | 2.26k | } |
1298 | 0 | break; |
1299 | | |
1300 | 209k | case CCALF_DERIVE_FILTER: |
1301 | 209k | { |
1302 | | // synchronization dependencies |
1303 | 209k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; |
1304 | 209k | if( ctuRsAddr == deriveFilterCtu ) |
1305 | 209k | { |
1306 | | // ensure statistics from all previous ctu's have been collected |
1307 | 209k | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; |
1308 | 218k | for( int y = 0; y < numCheckLines; y++ ) |
1309 | 216k | { |
1310 | 225k | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) |
1311 | 216k | { |
1312 | 216k | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; |
1313 | 216k | if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS ) |
1314 | 207k | return false; |
1315 | 216k | } |
1316 | 216k | } |
1317 | 209k | } |
1318 | 0 | else if( syncLines ) |
1319 | 0 | { |
1320 | | // ALF bitstream coding dependency for the sub-sequent CTU-lines |
1321 | 0 | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) |
1322 | 0 | return false; |
1323 | 0 | } |
1324 | 2.59k | if( checkReadyState ) |
1325 | 1.29k | return true; |
1326 | | |
1327 | 1.29k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive ); |
1328 | | |
1329 | | // start task |
1330 | 1.29k | if( slice.sps->ccalfEnabled ) |
1331 | 1.29k | { |
1332 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) |
1333 | 1.29k | { |
1334 | 1.29k | encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 ); |
1335 | 1.29k | } |
1336 | 0 | else if( syncLines ) |
1337 | 0 | { |
1338 | | // in sync lines mode: derive/select filter for the remaining lines |
1339 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; |
1340 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; |
1341 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1342 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); |
1343 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); |
1344 | 0 | } |
1345 | 1.29k | } |
1346 | 1.29k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive ); |
1347 | | |
1348 | 1.29k | processStates[ctuRsAddr] = CCALF_RECONSTRUCT; |
1349 | 1.29k | } |
1350 | 0 | break; |
1351 | | |
1352 | 13.5k | case CCALF_RECONSTRUCT: |
1353 | 13.5k | { |
1354 | | // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) |
1355 | 13.5k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; |
1356 | 13.5k | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT ) |
1357 | 9.05k | return false; |
1358 | | |
1359 | 4.53k | if( syncLines ) |
1360 | 0 | { |
1361 | | // ensure line-by-line reconstruction due to line synchronization |
1362 | 0 | if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false; |
1363 | | // check bottom due to rec. buffer usage in ccalf statistics |
1364 | 0 | if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false; |
1365 | 0 | } |
1366 | | |
1367 | 4.53k | if( checkReadyState ) |
1368 | 2.26k | return true; |
1369 | | |
1370 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon ); |
1371 | | |
1372 | 2.26k | if( slice.sps->ccalfEnabled ) |
1373 | 2.26k | { |
1374 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; |
1375 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) |
1376 | 4.04k | { |
1377 | 4.04k | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1378 | 4.04k | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); |
1379 | 4.04k | } |
1380 | 2.26k | } |
1381 | | |
1382 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon ); |
1383 | | |
1384 | | // extend pic border |
1385 | | // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done |
1386 | 2.26k | if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols ) |
1387 | 2.26k | { |
1388 | 2.26k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); |
1389 | 2.26k | const int margin = cs.picture->margin; |
1390 | 2.26k | recoBuf.extendBorderPelLft( y, height, margin ); |
1391 | 2.26k | recoBuf.extendBorderPelRgt( y, height, margin ); |
1392 | 2.26k | if(ctuPosY == 0) |
1393 | 1.29k | recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin ); |
1394 | 2.26k | if(ctuPosY + 1 == pcv.heightInCtus) |
1395 | 1.29k | recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin ); |
1396 | | |
1397 | | // for IFP lines synchro, do an additional increment signaling that CTU row is ready |
1398 | 2.26k | if( syncLines ) |
1399 | 0 | ++(pic->m_tileColsDone->at( ctuPosY )); |
1400 | 2.26k | } |
1401 | | |
1402 | | // perform finish only once for whole picture |
1403 | 2.26k | const unsigned finishCtu = pcv.sizeInCtus - 1; |
1404 | 2.26k | if( ctuRsAddr < finishCtu ) |
1405 | 967 | { |
1406 | 967 | processStates[ctuRsAddr] = PROCESS_DONE; |
1407 | | // processing done => terminate thread |
1408 | 967 | return true; |
1409 | 967 | } |
1410 | 1.29k | processStates[ctuRsAddr] = FINISH_SLICE; |
1411 | 1.29k | } |
1412 | | |
1413 | 19.8k | case FINISH_SLICE: |
1414 | 19.8k | { |
1415 | 19.8k | CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" ); |
1416 | | |
1417 | | // ensure all coding tasks have been done for all previous ctu's |
1418 | 43.2k | for( int i = 0; i < ctuRsAddr; i++ ) |
1419 | 41.2k | if( processStates[ i ] < FINISH_SLICE ) |
1420 | 17.9k | return false; |
1421 | | |
1422 | 1.94k | if( checkReadyState ) |
1423 | 647 | return true; |
1424 | | |
1425 | 1.29k | encSlice->finishCompressSlice( cs.picture, slice ); |
1426 | | |
1427 | 1.29k | processStates[ ctuRsAddr ] = PROCESS_DONE; |
1428 | | // processing done => terminate thread |
1429 | 1.29k | return true; |
1430 | 1.94k | } |
1431 | | |
1432 | 0 | case PROCESS_DONE: |
1433 | 0 | CHECK( true, "process state is PROCESS_DONE, but thread is still running" ); |
1434 | 0 | return true; |
1435 | | |
1436 | 0 | default: |
1437 | 0 | CHECK( true, "unknown process state" ); |
1438 | 0 | return true; |
1439 | 59.2M | } |
1440 | | |
1441 | 21.5k | return false; |
1442 | 59.2M | } bool vvenc::EncSlice::xProcessCtuTask<false>(int, void*) Line | Count | Source | 883 | 27.0k | { | 884 | 27.0k | CtuEncParam* ctuEncParam = static_cast<CtuEncParam*>( taskParam ); | 885 | 27.0k | Picture* pic = ctuEncParam->pic; | 886 | 27.0k | EncSlice* encSlice = ctuEncParam->encSlice; | 887 | 27.0k | CodingStructure& cs = *pic->cs; | 888 | 27.0k | Slice& slice = *cs.slice; | 889 | 27.0k | const PPS& pps = *slice.pps; | 890 | 27.0k | const PreCalcValues& pcv = *cs.pcv; | 891 | 27.0k | const int ctuRsAddr = ctuEncParam->ctuRsAddr; | 892 | 27.0k | const int ctuPosX = ctuEncParam->ctuPosX; | 893 | 27.0k | const int ctuPosY = ctuEncParam->ctuPosY; | 894 | 27.0k | const int x = ctuPosX << pcv.maxCUSizeLog2; | 895 | 27.0k | const int y = ctuPosY << pcv.maxCUSizeLog2; | 896 | 27.0k | const int width = std::min( pcv.maxCUSize, pcv.lumaWidth - x ); | 897 | 27.0k | const int height = std::min( pcv.maxCUSize, pcv.lumaHeight - y ); | 898 | 27.0k | const int ctuStride = pcv.widthInCtus; | 899 | 27.0k | const int lineIdx = ctuEncParam->tileLineResIdx; | 900 | 27.0k | ProcessCtuState* processStates = encSlice->m_processStates.data(); | 901 | 27.0k | const UnitArea& ctuArea = ctuEncParam->ctuArea; | 902 | 27.0k | const bool wppSyncEnabled = cs.sps->entropyCodingSyncEnabled; | 903 | 27.0k | const TaskType currState = processStates[ ctuRsAddr ]; | 904 | 27.0k | const unsigned syncLines = encSlice->m_pcEncCfg->m_ifpLines; | 905 | | | 906 | 27.0k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) ); | 907 | 27.0k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); | 908 | 27.0k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 
0 : 1 ) ); | 909 | | | 910 | | // process ctu's line wise from left to right | 911 | 27.0k | const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc; | 912 | 27.0k | if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ) | 913 | 0 | ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles | 914 | 27.0k | else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE ) | 915 | 0 | return false; | 916 | | | 917 | 27.0k | switch( currState ) | 918 | 27.0k | { | 919 | | // encode | 920 | 4.05k | case CTU_ENCODE: | 921 | 4.05k | { | 922 | | // CTU line-wise inter-frame parallel processing synchronization | 923 | 4.05k | if( syncLines ) | 924 | 0 | { | 925 | 0 | const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ); | 926 | 0 | if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) ) | 927 | 0 | { | 928 | 0 | return false; | 929 | 0 | } | 930 | 0 | } | 931 | | | 932 | | // general wpp conditions, top and top-right ctu have to be encoded | 933 | 4.05k | if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) ) | 934 | 0 | ; // allow parallel processing of CTU-encoding on independent tiles | 935 | 4.05k | else if( ctuPosY > 0 && processStates[ ctuRsAddr - ctuStride ] <= CTU_ENCODE ) | 936 | 0 | return false; | 937 | 4.05k | else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled ) | 938 | 0 | return false; | 939 | | | 940 | 4.05k | if( checkReadyState ) | 941 | 0 | return true; | 942 | | | 943 | | #ifdef TRACE_ENABLE_ITT | 944 | | std::stringstream ss; | 945 | | ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << 
ctuPosX; | 946 | | __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() ); | 947 | | #endif | 948 | 4.05k | ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode ); | 949 | | | 950 | 4.05k | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 951 | 4.05k | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 952 | 4.05k | EncCu& encCu = taskRsrc->m_encCu; | 953 | | | 954 | 4.05k | encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs ); | 955 | 4.05k | encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY ); | 956 | | | 957 | | // cleanup line memory when last ctu in line done to reduce overall memory consumption | 958 | 4.05k | if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) ) | 959 | 2.26k | { | 960 | 2.26k | lineEncRsrc->m_AffineProfList .resetAffineMVList(); | 961 | 2.26k | lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList(); | 962 | 2.26k | lineEncRsrc->m_ReuseUniMv .resetReusedUniMvs(); | 963 | 2.26k | lineEncRsrc->m_CachedBvs .resetIbcBvCand(); | 964 | 2.26k | } | 965 | | | 966 | 4.05k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) ); | 967 | 4.05k | ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode ); | 968 | | | 969 | 4.05k | processStates[ ctuRsAddr ] = RESHAPE_LF_VER; | 970 | 4.05k | } | 971 | 0 | break; | 972 | | | 973 | | // reshape + vertical loopfilter | 974 | 4.05k | case RESHAPE_LF_VER: | 975 | 4.05k | { | 976 | | // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC) | 977 | 4.05k | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; | 978 | 4.05k | const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 979 | 4.05k 
| const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX ); | 980 | | | 981 | 4.05k | const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1; | 982 | | | 983 | | // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles | 984 | 4.05k | if( hasTiles ) | 985 | 0 | { | 986 | 0 | if( ctuPosY > 0 ) | 987 | 0 | { | 988 | 0 | for( int i = -!!ctuPosX; i <= checkRight; i++ ) | 989 | 0 | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE ) | 990 | 0 | return false; | 991 | 0 | } | 992 | 0 | } | 993 | | | 994 | | // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too) | 995 | | // check right with max offset (due to WPP condition above, this implies top-right has been already encoded) | 996 | 8.10k | for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ ) | 997 | 4.05k | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE ) | 998 | 0 | return false; | 999 | | | 1000 | | // check bottom right with 1 CTU delay (this is only required for intra pred) | 1001 | | // at the right picture border this will check the bottom CTU | 1002 | 4.05k | const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX ); | 1003 | 4.05k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) | 1004 | 0 | return false; | 1005 | | | 1006 | 4.05k | if( checkReadyState ) | 1007 | 0 | return true; | 1008 | | | 1009 | 4.05k | ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer ); | 1010 | | | 1011 | | // reshape | 1012 | 4.05k | if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled ) | 1013 | 0 | { | 1014 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L ); | 1015 | 0 | PelBuf reco 
= pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height ); | 1016 | 0 | reco.rspSignal( pic->reshapeData.getInvLUT() ); | 1017 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1018 | 0 | } | 1019 | | | 1020 | | // loopfilter | 1021 | 4.05k | if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) | 1022 | 4.04k | { | 1023 | 4.04k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); | 1024 | | // calculate filter strengths | 1025 | 4.04k | encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true ); | 1026 | | | 1027 | | // vertical filter | 1028 | 4.04k | PelUnitBuf reco = cs.picture->getRecoBuf(); | 1029 | 4.04k | encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco ); | 1030 | 4.04k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1031 | 4.04k | } | 1032 | | | 1033 | 4.05k | ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer ); | 1034 | | | 1035 | 4.05k | processStates[ ctuRsAddr ] = LF_HOR; | 1036 | 4.05k | } | 1037 | 0 | break; | 1038 | | | 1039 | | // horizontal loopfilter | 1040 | 4.05k | case LF_HOR: | 1041 | 4.05k | { | 1042 | | // ensure horizontal ordering (from top to bottom) | 1043 | 4.05k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) ) | 1044 | 0 | return false; | 1045 | | | 1046 | | // ensure vertical loop filter of neighbor ctu's will not modify current residual | 1047 | | // check top, top-right and right ctu | 1048 | | // (top, top-right checked implicitly due to ordering check above) | 1049 | 4.05k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) | 1050 | 0 | return false; | 1051 | | | 1052 | 4.05k | if( checkReadyState ) | 1053 | 0 | return true; | 1054 | | | 1055 | 4.05k | ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor ); | 1056 | | | 1057 | 4.05k | if( 
!cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) | 1058 | 4.05k | { | 1059 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); | 1060 | 4.05k | PelUnitBuf reco = cs.picture->getRecoBuf(); | 1061 | 4.05k | encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco ); | 1062 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1063 | 4.05k | } | 1064 | | | 1065 | 4.05k | ITT_TASKEND( itt_domain_encode, itt_handle_lfHor ); | 1066 | | | 1067 | 4.05k | processStates[ ctuRsAddr ] = SAO_FILTER; | 1068 | 4.05k | } | 1069 | 0 | break; | 1070 | | | 1071 | | // SAO filter | 1072 | 4.05k | case SAO_FILTER: | 1073 | 4.05k | { | 1074 | | // general wpp conditions, top and top-right ctu have to be filtered | 1075 | 4.05k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; | 1076 | 4.05k | if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; | 1077 | | | 1078 | | // ensure loop filter of neighbor ctu's will not modify current residual | 1079 | | // sao processing dependents on +1 pixel to each side | 1080 | | // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked | 1081 | 4.05k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; | 1082 | 4.05k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; | 1083 | 4.05k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false; | 1084 | | | 1085 | 4.05k | if( checkReadyState ) | 1086 | 0 | return true; | 1087 | | | 1088 | 4.05k | ITT_TASKSTART( itt_domain_encode, itt_handle_sao ); | 1089 | | | 1090 | | // SAO filter | 1091 | 4.05k | if( slice.sps->saoEnabled && pic->useSAO ) | 
1092 | 4.05k | { | 1093 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L ); | 1094 | 4.05k | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1095 | 4.05k | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 1096 | 4.05k | EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao; | 1097 | | | 1098 | 4.05k | encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache ); | 1099 | 4.05k | encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY ); | 1100 | 4.05k | encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr ); | 1101 | 4.05k | encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() ); | 1102 | 4.05k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1103 | 4.05k | } | 1104 | | | 1105 | | // ALF border extension | 1106 | 4.05k | if( cs.sps->alfEnabled ) | 1107 | 4.05k | { | 1108 | | // we have to do some kind of position aware boundary padding | 1109 | | // it's done here because the conditions are readable | 1110 | 4.05k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1111 | 4.05k | const int fltSize = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1; | 1112 | 4.05k | const int xL = ( ctuPosX == 0 ) ? ( x-fltSize ) : ( x ); | 1113 | 4.05k | const int xR = ( ctuPosX+1 == pcv.widthInCtus ) ? 
( x+width+fltSize ) : ( x+width ); | 1114 | | | 1115 | 4.05k | if( ctuPosX == 0 ) recoBuf.extendBorderPelLft( y, height, fltSize ); | 1116 | 4.05k | if( ctuPosX+1 == pcv.widthInCtus ) recoBuf.extendBorderPelRgt( y, height, fltSize ); | 1117 | 4.05k | if( ctuPosY == 0 ) recoBuf.extendBorderPelTop( xL, xR-xL, fltSize ); | 1118 | 4.05k | if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize ); | 1119 | | | 1120 | 4.05k | encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY); | 1121 | 4.05k | } | 1122 | | | 1123 | | // DMVR refinement can be stored now | 1124 | 4.05k | if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag ) | 1125 | 4.05k | { | 1126 | 4.05k | CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY ); | 1127 | 4.05k | } | 1128 | 4.05k | ITT_TASKEND( itt_domain_encode, itt_handle_sao ); | 1129 | | | 1130 | 4.05k | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; | 1131 | 4.05k | const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1132 | 4.05k | if( ctuPosX == lastCtuColInTileRow ) | 1133 | 2.26k | { | 1134 | 2.26k | processStates[ctuRsAddr] = ALF_GET_STATISTICS; | 1135 | 2.26k | } | 1136 | 1.78k | else | 1137 | 1.78k | { | 1138 | 1.78k | processStates[ctuRsAddr] = PROCESS_DONE; | 1139 | 1.78k | return true; | 1140 | 1.78k | } | 1141 | 4.05k | } | 1142 | 2.26k | break; | 1143 | | | 1144 | 2.26k | case ALF_GET_STATISTICS: | 1145 | 2.26k | { | 1146 | | // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's) | 1147 | | // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked | 1148 | 2.26k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; | 1149 | 2.26k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; | 1150 | 2.26k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, 
SAO_FILTER ) ) return false; | 1151 | | | 1152 | 2.26k | if( checkReadyState ) | 1153 | 0 | return true; | 1154 | | | 1155 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat ); | 1156 | | | 1157 | | // ALF pre-processing | 1158 | 2.26k | if( slice.sps->alfEnabled ) | 1159 | 2.26k | { | 1160 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1161 | 2.26k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1162 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1163 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1164 | 4.05k | { | 1165 | 4.05k | encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1166 | 4.05k | } | 1167 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1168 | 2.26k | } | 1169 | | | 1170 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat ); | 1171 | | | 1172 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1173 | 2.26k | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1174 | 2.26k | processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? 
ALF_RECONSTRUCT: ALF_DERIVE_FILTER; | 1175 | 2.26k | } | 1176 | 0 | break; | 1177 | | | 1178 | 1.29k | case ALF_DERIVE_FILTER: | 1179 | 1.29k | { | 1180 | 1.29k | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1181 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) | 1182 | 1.29k | { | 1183 | | // ensure statistics from all previous ctu's have been collected | 1184 | 1.29k | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; | 1185 | 3.56k | for( int y = 0; y < numCheckLines; y++ ) | 1186 | 2.26k | { | 1187 | 4.53k | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) | 1188 | 2.26k | { | 1189 | 2.26k | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1190 | 2.26k | if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS ) | 1191 | 0 | return false; | 1192 | 2.26k | } | 1193 | 2.26k | } | 1194 | 1.29k | } | 1195 | 0 | else if( syncLines ) | 1196 | 0 | { | 1197 | | // ALF bitstream coding dependency for the sub-sequent ctu-lines | 1198 | 0 | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) | 1199 | 0 | return false; | 1200 | 0 | } | 1201 | 1.29k | if( checkReadyState ) | 1202 | 0 | return true; | 1203 | | | 1204 | 1.29k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive ); | 1205 | | // ALF post-processing | 1206 | 1.29k | if( slice.sps->alfEnabled ) | 1207 | 1.29k | { | 1208 | 1.29k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1209 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) | 1210 | 1.29k | { | 1211 | 1.29k | encSlice->m_pALF->initDerivation( slice ); | 1212 | 1.29k | encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 ); | 1213 | 1.29k | encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false ); | 1214 | 
1.29k | } | 1215 | 0 | else if( syncLines ) | 1216 | 0 | { | 1217 | | // in sync lines mode: derive/select filter for the remaining lines | 1218 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1219 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 1220 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1221 | 0 | for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++) | 1222 | 0 | { | 1223 | 0 | encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu ); | 1224 | 0 | } | 1225 | 0 | } | 1226 | 1.29k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1227 | 1.29k | } | 1228 | | | 1229 | 1.29k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive ); | 1230 | 1.29k | processStates[ ctuRsAddr ] = ALF_RECONSTRUCT; | 1231 | 1.29k | } | 1232 | 0 | break; | 1233 | | | 1234 | 2.26k | case ALF_RECONSTRUCT: | 1235 | 2.26k | { | 1236 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1237 | 2.26k | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1238 | 2.26k | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT ) | 1239 | 0 | return false; | 1240 | 2.26k | else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 ) | 1241 | 0 | { | 1242 | 0 | const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus(); | 1243 | 0 | const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 ); | 1244 | 0 | if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) | 1245 | 0 | return false; | 1246 | 0 | } | 1247 | | | 1248 | 2.26k | if( checkReadyState ) | 1249 | 0 | return true; | 1250 | | | 1251 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon ); | 1252 | | | 1253 | 2.26k | if( 
slice.sps->alfEnabled ) | 1254 | 2.26k | { | 1255 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1256 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1257 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1258 | 4.05k | { | 1259 | 4.05k | encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1260 | 4.05k | } | 1261 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1262 | 2.26k | } | 1263 | | | 1264 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon ); | 1265 | 2.26k | processStates[ctuRsAddr] = CCALF_GET_STATISTICS; | 1266 | 2.26k | } | 1267 | | // dont break, no additional deps, can continue straigt away! | 1268 | | //break; | 1269 | | | 1270 | 3.08k | case CCALF_GET_STATISTICS: | 1271 | 3.08k | { | 1272 | 3.08k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; | 1273 | 2.50k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; | 1274 | | | 1275 | 2.26k | if( checkReadyState ) | 1276 | 0 | return true; | 1277 | | | 1278 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat ); | 1279 | | | 1280 | | // ALF pre-processing | 1281 | 2.26k | if( slice.sps->ccalfEnabled ) | 1282 | 2.26k | { | 1283 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L); | 1284 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1285 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1286 | 4.05k | { | 1287 | 4.05k | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1288 | 4.05k | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, 
encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1289 | 4.05k | } | 1290 | 2.26k | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1291 | 2.26k | } | 1292 | | | 1293 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat ); | 1294 | | | 1295 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1296 | 2.26k | processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER; | 1297 | 2.26k | } | 1298 | 0 | break; | 1299 | | | 1300 | 1.29k | case CCALF_DERIVE_FILTER: | 1301 | 1.29k | { | 1302 | | // synchronization dependencies | 1303 | 1.29k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; | 1304 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) | 1305 | 1.29k | { | 1306 | | // ensure statistics from all previous ctu's have been collected | 1307 | 1.29k | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; | 1308 | 3.56k | for( int y = 0; y < numCheckLines; y++ ) | 1309 | 2.26k | { | 1310 | 4.53k | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) | 1311 | 2.26k | { | 1312 | 2.26k | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1313 | 2.26k | if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS ) | 1314 | 0 | return false; | 1315 | 2.26k | } | 1316 | 2.26k | } | 1317 | 1.29k | } | 1318 | 0 | else if( syncLines ) | 1319 | 0 | { | 1320 | | // ALF bitstream coding dependency for the sub-sequent CTU-lines | 1321 | 0 | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) | 1322 | 0 | return false; | 1323 | 0 | } | 1324 | 1.29k | if( checkReadyState ) | 1325 | 0 | return true; | 1326 | | | 1327 | 1.29k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive ); | 1328 | | | 1329 | | // start task | 1330 
| 1.29k | if( slice.sps->ccalfEnabled ) | 1331 | 1.29k | { | 1332 | 1.29k | if( ctuRsAddr == deriveFilterCtu ) | 1333 | 1.29k | { | 1334 | 1.29k | encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 ); | 1335 | 1.29k | } | 1336 | 0 | else if( syncLines ) | 1337 | 0 | { | 1338 | | // in sync lines mode: derive/select filter for the remaining lines | 1339 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1340 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 1341 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1342 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); | 1343 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); | 1344 | 0 | } | 1345 | 1.29k | } | 1346 | 1.29k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive ); | 1347 | | | 1348 | 1.29k | processStates[ctuRsAddr] = CCALF_RECONSTRUCT; | 1349 | 1.29k | } | 1350 | 0 | break; | 1351 | | | 1352 | 2.26k | case CCALF_RECONSTRUCT: | 1353 | 2.26k | { | 1354 | | // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1355 | 2.26k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; | 1356 | 2.26k | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT ) | 1357 | 0 | return false; | 1358 | | | 1359 | 2.26k | if( syncLines ) | 1360 | 0 | { | 1361 | | // ensure line-by-line reconstruction due to line synchronization | 1362 | 0 | if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false; | 1363 | | // check bottom due to rec. 
buffer usage in ccalf statistics | 1364 | 0 | if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false; | 1365 | 0 | } | 1366 | | | 1367 | 2.26k | if( checkReadyState ) | 1368 | 0 | return true; | 1369 | | | 1370 | 2.26k | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon ); | 1371 | | | 1372 | 2.26k | if( slice.sps->ccalfEnabled ) | 1373 | 2.26k | { | 1374 | 2.26k | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1375 | 6.31k | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1376 | 4.04k | { | 1377 | 4.04k | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1378 | 4.04k | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1379 | 4.04k | } | 1380 | 2.26k | } | 1381 | | | 1382 | 2.26k | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon ); | 1383 | | | 1384 | | // extend pic border | 1385 | | // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done | 1386 | 2.26k | if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols ) | 1387 | 2.26k | { | 1388 | 2.26k | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1389 | 2.26k | const int margin = cs.picture->margin; | 1390 | 2.26k | recoBuf.extendBorderPelLft( y, height, margin ); | 1391 | 2.26k | recoBuf.extendBorderPelRgt( y, height, margin ); | 1392 | 2.26k | if(ctuPosY == 0) | 1393 | 1.29k | recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin ); | 1394 | 2.26k | if(ctuPosY + 1 == pcv.heightInCtus) | 1395 | 1.29k | recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin ); | 1396 | | | 1397 | | // for IFP lines synchro, do an additional increment signaling that CTU row is ready | 1398 | 2.26k | if( syncLines ) | 1399 | 0 | ++(pic->m_tileColsDone->at( ctuPosY )); | 1400 | 2.26k | } | 1401 
| | | 1402 | | // perform finish only once for whole picture | 1403 | 2.26k | const unsigned finishCtu = pcv.sizeInCtus - 1; | 1404 | 2.26k | if( ctuRsAddr < finishCtu ) | 1405 | 967 | { | 1406 | 967 | processStates[ctuRsAddr] = PROCESS_DONE; | 1407 | | // processing done => terminate thread | 1408 | 967 | return true; | 1409 | 967 | } | 1410 | 1.29k | processStates[ctuRsAddr] = FINISH_SLICE; | 1411 | 1.29k | } | 1412 | | | 1413 | 1.94k | case FINISH_SLICE: | 1414 | 1.94k | { | 1415 | 1.94k | CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" ); | 1416 | | | 1417 | | // ensure all coding tasks have been done for all previous ctu's | 1418 | 5.25k | for( int i = 0; i < ctuRsAddr; i++ ) | 1419 | 3.95k | if( processStates[ i ] < FINISH_SLICE ) | 1420 | 647 | return false; | 1421 | | | 1422 | 1.29k | if( checkReadyState ) | 1423 | 0 | return true; | 1424 | | | 1425 | 1.29k | encSlice->finishCompressSlice( cs.picture, slice ); | 1426 | | | 1427 | 1.29k | processStates[ ctuRsAddr ] = PROCESS_DONE; | 1428 | | // processing done => terminate thread | 1429 | 1.29k | return true; | 1430 | 1.29k | } | 1431 | | | 1432 | 0 | case PROCESS_DONE: | 1433 | 0 | CHECK( true, "process state is PROCESS_DONE, but thread is still running" ); | 1434 | 0 | return true; | 1435 | | | 1436 | 0 | default: | 1437 | 0 | CHECK( true, "unknown process state" ); | 1438 | 0 | return true; | 1439 | 27.0k | } | 1440 | | | 1441 | 21.5k | return false; | 1442 | 27.0k | } |
bool vvenc::EncSlice::xProcessCtuTask<true>(int, void*) Line | Count | Source | 883 | 131M | { | 884 | 131M | CtuEncParam* ctuEncParam = static_cast<CtuEncParam*>( taskParam ); | 885 | 131M | Picture* pic = ctuEncParam->pic; | 886 | 131M | EncSlice* encSlice = ctuEncParam->encSlice; | 887 | 131M | CodingStructure& cs = *pic->cs; | 888 | 131M | Slice& slice = *cs.slice; | 889 | 131M | const PPS& pps = *slice.pps; | 890 | 131M | const PreCalcValues& pcv = *cs.pcv; | 891 | 131M | const int ctuRsAddr = ctuEncParam->ctuRsAddr; | 892 | 131M | const int ctuPosX = ctuEncParam->ctuPosX; | 893 | 131M | const int ctuPosY = ctuEncParam->ctuPosY; | 894 | 131M | const int x = ctuPosX << pcv.maxCUSizeLog2; | 895 | 131M | const int y = ctuPosY << pcv.maxCUSizeLog2; | 896 | 131M | const int width = std::min( pcv.maxCUSize, pcv.lumaWidth - x ); | 897 | 131M | const int height = std::min( pcv.maxCUSize, pcv.lumaHeight - y ); | 898 | 131M | const int ctuStride = pcv.widthInCtus; | 899 | 131M | const int lineIdx = ctuEncParam->tileLineResIdx; | 900 | 131M | ProcessCtuState* processStates = encSlice->m_processStates.data(); | 901 | 131M | const UnitArea& ctuArea = ctuEncParam->ctuArea; | 902 | 131M | const bool wppSyncEnabled = cs.sps->entropyCodingSyncEnabled; | 903 | 131M | const TaskType currState = processStates[ ctuRsAddr ]; | 904 | 131M | const unsigned syncLines = encSlice->m_pcEncCfg->m_ifpLines; | 905 | | | 906 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) ); | 907 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); | 908 | 131M | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 
0 : 1 ) ); | 909 | | | 910 | | // process ctu's line wise from left to right | 911 | 131M | const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc; | 912 | 131M | if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ) | 913 | 0 | ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles | 914 | 131M | else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE ) | 915 | 71.8M | return false; | 916 | | | 917 | 59.2M | switch( currState ) | 918 | 59.2M | { | 919 | | // encode | 920 | 28.5M | case CTU_ENCODE: | 921 | 28.5M | { | 922 | | // CTU line-wise inter-frame parallel processing synchronization | 923 | 28.5M | if( syncLines ) | 924 | 0 | { | 925 | 0 | const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) ); | 926 | 0 | if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) ) | 927 | 0 | { | 928 | 0 | return false; | 929 | 0 | } | 930 | 0 | } | 931 | | | 932 | | // general wpp conditions, top and top-right ctu have to be encoded | 933 | 28.5M | if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) ) | 934 | 0 | ; // allow parallel processing of CTU-encoding on independent tiles | 935 | 28.5M | else if( ctuPosY > 0 && processStates[ ctuRsAddr - ctuStride ] <= CTU_ENCODE ) | 936 | 23.3M | return false; | 937 | 5.17M | else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled ) | 938 | 5.16M | return false; | 939 | | | 940 | 4.02k | if( checkReadyState ) | 941 | 4.04k | return true; | 942 | | | 943 | | #ifdef TRACE_ENABLE_ITT | 944 | | std::stringstream ss; | 945 | | ss << "Encode_" << slice.poc << "_CTU_" << 
ctuPosY << "_" << ctuPosX; | 946 | | __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() ); | 947 | | #endif | 948 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode ); | 949 | | | 950 | 18.4E | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 951 | 18.4E | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 952 | 18.4E | EncCu& encCu = taskRsrc->m_encCu; | 953 | | | 954 | 18.4E | encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs ); | 955 | 18.4E | encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY ); | 956 | | | 957 | | // cleanup line memory when last ctu in line done to reduce overall memory consumption | 958 | 18.4E | if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) ) | 959 | 0 | { | 960 | 0 | lineEncRsrc->m_AffineProfList .resetAffineMVList(); | 961 | 0 | lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList(); | 962 | 0 | lineEncRsrc->m_ReuseUniMv .resetReusedUniMvs(); | 963 | 0 | lineEncRsrc->m_CachedBvs .resetIbcBvCand(); | 964 | 0 | } | 965 | | | 966 | 18.4E | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) ); | 967 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode ); | 968 | | | 969 | 18.4E | processStates[ ctuRsAddr ] = RESHAPE_LF_VER; | 970 | 18.4E | } | 971 | 0 | break; | 972 | | | 973 | | // reshape + vertical loopfilter | 974 | 17.5M | case RESHAPE_LF_VER: | 975 | 17.5M | { | 976 | | // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC) | 977 | 17.5M | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; | 978 | 17.5M | const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 979 | 17.5M | 
const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX ); | 980 | | | 981 | 17.5M | const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1; | 982 | | | 983 | | // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles | 984 | 17.5M | if( hasTiles ) | 985 | 0 | { | 986 | 0 | if( ctuPosY > 0 ) | 987 | 0 | { | 988 | 0 | for( int i = -!!ctuPosX; i <= checkRight; i++ ) | 989 | 0 | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE ) | 990 | 0 | return false; | 991 | 0 | } | 992 | 0 | } | 993 | | | 994 | | // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too) | 995 | | // check right with max offset (due to WPP condition above, this implies top-right has been already encoded) | 996 | 26.4M | for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ ) | 997 | 17.5M | if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE ) | 998 | 8.56M | return false; | 999 | | | 1000 | | // check bottom right with 1 CTU delay (this is only required for intra pred) | 1001 | | // at the right picture border this will check the bottom CTU | 1002 | 8.95M | const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX ); | 1003 | 8.95M | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) | 1004 | 8.95M | return false; | 1005 | | | 1006 | 3.97k | if( checkReadyState ) | 1007 | 4.04k | return true; | 1008 | | | 1009 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer ); | 1010 | | | 1011 | | // reshape | 1012 | 18.4E | if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled ) | 1013 | 0 | { | 1014 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L ); | 1015 | 0 | 
PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height ); | 1016 | 0 | reco.rspSignal( pic->reshapeData.getInvLUT() ); | 1017 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1018 | 0 | } | 1019 | | | 1020 | | // loopfilter | 1021 | 18.4E | if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) | 1022 | 0 | { | 1023 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); | 1024 | | // calculate filter strengths | 1025 | 0 | encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true ); | 1026 | | | 1027 | | // vertical filter | 1028 | 0 | PelUnitBuf reco = cs.picture->getRecoBuf(); | 1029 | 0 | encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco ); | 1030 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1031 | 0 | } | 1032 | | | 1033 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer ); | 1034 | | | 1035 | 18.4E | processStates[ ctuRsAddr ] = LF_HOR; | 1036 | 18.4E | } | 1037 | 0 | break; | 1038 | | | 1039 | | // horizontal loopfilter | 1040 | 503k | case LF_HOR: | 1041 | 503k | { | 1042 | | // ensure horizontal ordering (from top to bottom) | 1043 | 503k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) ) | 1044 | 168k | return false; | 1045 | | | 1046 | | // ensure vertical loop filter of neighbor ctu's will not modify current residual | 1047 | | // check top, top-right and right ctu | 1048 | | // (top, top-right checked implicitly due to ordering check above) | 1049 | 335k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) | 1050 | 331k | return false; | 1051 | | | 1052 | 4.00k | if( checkReadyState ) | 1053 | 4.05k | return true; | 1054 | | | 1055 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor ); | 1056 | | | 1057 | 18.4E | if( 
!cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled ) | 1058 | 0 | { | 1059 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L ); | 1060 | 0 | PelUnitBuf reco = cs.picture->getRecoBuf(); | 1061 | 0 | encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco ); | 1062 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1063 | 0 | } | 1064 | | | 1065 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_lfHor ); | 1066 | | | 1067 | 18.4E | processStates[ ctuRsAddr ] = SAO_FILTER; | 1068 | 18.4E | } | 1069 | 0 | break; | 1070 | | | 1071 | | // SAO filter | 1072 | 314k | case SAO_FILTER: | 1073 | 314k | { | 1074 | | // general wpp conditions, top and top-right ctu have to be filtered | 1075 | 314k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; | 1076 | 255k | if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false; | 1077 | | | 1078 | | // ensure loop filter of neighbor ctu's will not modify current residual | 1079 | | // sao processing dependents on +1 pixel to each side | 1080 | | // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked | 1081 | 230k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; | 1082 | 213k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, true ) ) return false; | 1083 | 7.27k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false; | 1084 | | | 1085 | 4.03k | if( checkReadyState ) | 1086 | 4.05k | return true; | 1087 | | | 1088 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_sao ); | 1089 | | | 1090 | | // SAO filter | 1091 | 18.4E | if( slice.sps->saoEnabled && pic->useSAO ) | 1092 | 0 | { | 1093 | 0 | 
PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L ); | 1094 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1095 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 1096 | 0 | EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao; | 1097 | |
| 1098 | 0 | encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache ); | 1099 | 0 | encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY ); | 1100 | 0 | encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr ); | 1101 | 0 | encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() ); | 1102 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1103 | 0 | } | 1104 | | | 1105 | | // ALF border extension | 1106 | 18.4E | if( cs.sps->alfEnabled ) | 1107 | 0 | { | 1108 | | // we have to do some kind of position aware boundary padding | 1109 | | // it's done here because the conditions are readable | 1110 | 0 | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1111 | 0 | const int fltSize = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1; | 1112 | 0 | const int xL = ( ctuPosX == 0 ) ? ( x-fltSize ) : ( x ); | 1113 | 0 | const int xR = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width ); | 1114 | |
| 1115 | 0 | if( ctuPosX == 0 ) recoBuf.extendBorderPelLft( y, height, fltSize ); | 1116 | 0 | if( ctuPosX+1 == pcv.widthInCtus ) recoBuf.extendBorderPelRgt( y, height, fltSize ); | 1117 | 0 | if( ctuPosY == 0 ) recoBuf.extendBorderPelTop( xL, xR-xL, fltSize ); | 1118 | 0 | if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize ); | 1119 | |
| 1120 | 0 | encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY); | 1121 | 0 | } | 1122 | | | 1123 | | // DMVR refinement can be stored now | 1124 | 18.4E | if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag ) | 1125 | 0 | { | 1126 | 0 | CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY ); | 1127 | 0 | } | 1128 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_sao ); | 1129 | | | 1130 | 18.4E | const int tileCol = slice.pps->ctuToTileCol[ctuPosX]; | 1131 | 18.4E | const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1132 | 18.4E | if( ctuPosX == lastCtuColInTileRow ) | 1133 | 0 | { | 1134 | 0 | processStates[ctuRsAddr] = ALF_GET_STATISTICS; | 1135 | 0 | } | 1136 | 18.4E | else | 1137 | 18.4E | { | 1138 | 18.4E | processStates[ctuRsAddr] = PROCESS_DONE; | 1139 | 18.4E | return true; | 1140 | 18.4E | } | 1141 | 18.4E | } | 1142 | 0 | break; | 1143 | | | 1144 | 88.4k | case ALF_GET_STATISTICS: | 1145 | 88.4k | { | 1146 | | // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's) | 1147 | | // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked | 1148 | 88.4k | if( checkCtuTaskNbRgt ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; | 1149 | 88.4k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; | 1150 | 2.26k | if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false; | 1151 | | | 1152 | 2.26k | if( checkReadyState ) | 1153 | 2.26k | return true; | 1154 | | | 1155 | 0 | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat ); | 1156 | | | 1157 | | // ALF pre-processing | 1158 | 0 | if( slice.sps->alfEnabled ) | 1159 | 0 | { | 1160 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1161 | 0 | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1162 | 0 | const int firstCtuInRow = 
ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1163 | 0 | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1164 | 0 | { | 1165 | 0 | encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1166 | 0 | } | 1167 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1168 | 0 | } | 1169 | |
| 1170 | 0 | ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat ); | 1171 | | | 1172 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1173 | 0 | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1174 | 0 | processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER; | 1175 | 0 | } | 1176 | 0 | break; | 1177 | | | 1178 | 1.98M | case ALF_DERIVE_FILTER: | 1179 | 1.98M | { | 1180 | 1.98M | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1181 | 1.98M | if( ctuRsAddr == deriveFilterCtu ) | 1182 | 1.98M | { | 1183 | | // ensure statistics from all previous ctu's have been collected | 1184 | 1.98M | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; | 1185 | 2.00M | for( int y = 0; y < numCheckLines; y++ ) | 1186 | 2.00M | { | 1187 | 2.01M | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) | 1188 | 2.00M | { | 1189 | 2.00M | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1190 | 2.00M | if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS ) | 1191 | 1.98M | return false; | 1192 | 2.00M | } | 1193 | 2.00M | } | 1194 | 1.98M | } | 1195 | 0 | else if( syncLines ) | 1196 | 0 | { | 1197 | | // ALF bitstream coding dependency for the sub-sequent ctu-lines | 1198 | 0 | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) | 1199 | 0 | return false; | 1200 | 0 | } | 1201 | 1.29k | if( checkReadyState ) | 1202 | 1.29k | return true; | 1203 | | | 1204 | 0 | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive ); | 1205 | | // ALF post-processing | 1206 | 0 | if( slice.sps->alfEnabled ) | 1207 | 0 | { | 1208 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1209 | 0 | if( ctuRsAddr == deriveFilterCtu ) | 1210 | 0 | { | 1211 | 0 
| encSlice->m_pALF->initDerivation( slice ); | 1212 | 0 | encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 ); | 1213 | 0 | encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false ); | 1214 | 0 | } | 1215 | 0 | else if( syncLines ) | 1216 | 0 | { | 1217 | | // in sync lines mode: derive/select filter for the remaining lines | 1218 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1219 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ threadIdx ]; | 1220 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1221 | 0 | for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++) | 1222 | 0 | { | 1223 | 0 | encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu ); | 1224 | 0 | } | 1225 | 0 | } | 1226 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1227 | 0 | } | 1228 | |
| 1229 | 0 | ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive ); | 1230 | 0 | processStates[ ctuRsAddr ] = ALF_RECONSTRUCT; | 1231 | 0 | } | 1232 | 0 | break; | 1233 | | | 1234 | 10.0M | case ALF_RECONSTRUCT: | 1235 | 10.0M | { | 1236 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1237 | 10.0M | const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu; | 1238 | 10.0M | if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT ) | 1239 | 10.0M | return false; | 1240 | 2.26k | else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 ) | 1241 | 0 | { | 1242 | 0 | const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus(); | 1243 | 0 | const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 ); | 1244 | 0 | if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) | 1245 | 0 | return false; | 1246 | 0 | } | 1247 | | | 1248 | 2.26k | if( checkReadyState ) | 1249 | 2.26k | return true; | 1250 | | | 1251 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon ); | 1252 | | | 1253 | 18.4E | if( slice.sps->alfEnabled ) | 1254 | 0 | { | 1255 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L ); | 1256 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1257 | 0 | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1258 | 0 | { | 1259 | 0 | encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1260 | 0 | } | 1261 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1262 | 0 | } | 1263 | | | 1264 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon ); | 1265 | 18.4E | processStates[ctuRsAddr] = CCALF_GET_STATISTICS; | 1266 | 18.4E | } | 1267 | | // dont break, no additional 
deps, can continue straigt away! | 1268 | | //break; | 1269 | | | 1270 | 5.73k | case CCALF_GET_STATISTICS: | 1271 | 5.73k | { | 1272 | 5.73k | if( checkCtuTaskNbTop ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; | 1273 | 2.45k | if( checkCtuTaskNbBot ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false; | 1274 | | | 1275 | 823 | if( checkReadyState ) | 1276 | 824 | return true; | 1277 | | | 1278 | 18.4E | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat ); | 1279 | | | 1280 | | // ALF pre-processing | 1281 | 18.4E | if( slice.sps->ccalfEnabled ) | 1282 | 0 | { | 1283 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L); | 1284 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1285 | 0 | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1286 | 0 | { | 1287 | 0 | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1288 | 0 | encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1289 | 0 | } | 1290 | 0 | PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L ); | 1291 | 0 | } | 1292 | | | 1293 | 18.4E | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat ); | 1294 | | | 1295 | | // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1296 | 18.4E | processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? 
CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER; | 1297 | 18.4E | } | 1298 | 0 | break; | 1299 | | | 1300 | 208k | case CCALF_DERIVE_FILTER: | 1301 | 208k | { | 1302 | | // synchronization dependencies | 1303 | 208k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; | 1304 | 208k | if( ctuRsAddr == deriveFilterCtu ) | 1305 | 208k | { | 1306 | | // ensure statistics from all previous ctu's have been collected | 1307 | 208k | int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1; | 1308 | 215k | for( int y = 0; y < numCheckLines; y++ ) | 1309 | 214k | { | 1310 | 220k | for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ ) | 1311 | 214k | { | 1312 | 214k | const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1; | 1313 | 214k | if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS ) | 1314 | 207k | return false; | 1315 | 214k | } | 1316 | 214k | } | 1317 | 208k | } | 1318 | 0 | else if( syncLines ) | 1319 | 0 | { | 1320 | | // ALF bitstream coding dependency for the sub-sequent CTU-lines | 1321 | 0 | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) | 1322 | 0 | return false; | 1323 | 0 | } | 1324 | 1.29k | if( checkReadyState ) | 1325 | 1.29k | return true; | 1326 | | | 1327 | 0 | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive ); | 1328 | | | 1329 | | // start task | 1330 | 0 | if( slice.sps->ccalfEnabled ) | 1331 | 0 | { | 1332 | 0 | if( ctuRsAddr == deriveFilterCtu ) | 1333 | 0 | { | 1334 | 0 | encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 ); | 1335 | 0 | } | 1336 | 0 | else if( syncLines ) | 1337 | 0 | { | 1338 | | // in sync lines mode: derive/select filter for the remaining lines | 1339 | 0 | TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ]; | 1340 | 0 | PerThreadRsrc* taskRsrc = encSlice->m_ThreadRsrc[ 
threadIdx ]; | 1341 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1342 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); | 1343 | 0 | encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr ); | 1344 | 0 | } | 1345 | 0 | } | 1346 | 0 | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive ); | 1347 | |
| 1348 | 0 | processStates[ctuRsAddr] = CCALF_RECONSTRUCT; | 1349 | 0 | } | 1350 | 0 | break; | 1351 | | | 1352 | 11.3k | case CCALF_RECONSTRUCT: | 1353 | 11.3k | { | 1354 | | // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode) | 1355 | 11.3k | const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu; | 1356 | 11.3k | if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT ) | 1357 | 9.05k | return false; | 1358 | | | 1359 | 2.26k | if( syncLines ) | 1360 | 0 | { | 1361 | | // ensure line-by-line reconstruction due to line synchronization | 1362 | 0 | if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false; | 1363 | | // check bottom due to rec. buffer usage in ccalf statistics | 1364 | 0 | if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false; | 1365 | 0 | } | 1366 | | | 1367 | 2.26k | if( checkReadyState ) | 1368 | 2.26k | return true; | 1369 | | | 1370 | 0 | ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon ); | 1371 | |
| 1372 | 0 | if( slice.sps->ccalfEnabled ) | 1373 | 0 | { | 1374 | 0 | const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]]; | 1375 | 0 | for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ ) | 1376 | 0 | { | 1377 | 0 | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1378 | 0 | encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf ); | 1379 | 0 | } | 1380 | 0 | } | 1381 | |
| 1382 | 0 | ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon ); | 1383 | | | 1384 | | // extend pic border | 1385 | | // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done | 1386 | 0 | if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols ) | 1387 | 0 | { | 1388 | 0 | PelUnitBuf recoBuf = cs.picture->getRecoBuf(); | 1389 | 0 | const int margin = cs.picture->margin; | 1390 | 0 | recoBuf.extendBorderPelLft( y, height, margin ); | 1391 | 0 | recoBuf.extendBorderPelRgt( y, height, margin ); | 1392 | 0 | if(ctuPosY == 0) | 1393 | 0 | recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin ); | 1394 | 0 | if(ctuPosY + 1 == pcv.heightInCtus) | 1395 | 0 | recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin ); | 1396 | | | 1397 | | // for IFP lines synchro, do an additional increment signaling that CTU row is ready | 1398 | 0 | if( syncLines ) | 1399 | 0 | ++(pic->m_tileColsDone->at( ctuPosY )); | 1400 | 0 | } | 1401 | | | 1402 | | // perform finish only once for whole picture | 1403 | 0 | const unsigned finishCtu = pcv.sizeInCtus - 1; | 1404 | 0 | if( ctuRsAddr < finishCtu ) | 1405 | 0 | { | 1406 | 0 | processStates[ctuRsAddr] = PROCESS_DONE; | 1407 | | // processing done => terminate thread | 1408 | 0 | return true; | 1409 | 0 | } | 1410 | 0 | processStates[ctuRsAddr] = FINISH_SLICE; | 1411 | 0 | } | 1412 | | | 1413 | 17.9k | case FINISH_SLICE: | 1414 | 17.9k | { | 1415 | 17.9k | CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" ); | 1416 | | | 1417 | | // ensure all coding tasks have been done for all previous ctu's | 1418 | 37.9k | for( int i = 0; i < ctuRsAddr; i++ ) | 1419 | 37.3k | if( processStates[ i ] < FINISH_SLICE ) | 1420 | 17.2k | return false; | 1421 | | | 1422 | 647 | if( checkReadyState ) | 1423 | 647 | return true; | 1424 | | | 1425 | 0 | encSlice->finishCompressSlice( cs.picture, slice ); | 1426 | |
| 1427 | 0 | processStates[ ctuRsAddr ] = PROCESS_DONE; | 1428 | | // processing done => terminate thread | 1429 | 0 | return true; | 1430 | 647 | } | 1431 | | | 1432 | 0 | case PROCESS_DONE: | 1433 | 0 | CHECK( true, "process state is PROCESS_DONE, but thread is still running" ); | 1434 | 0 | return true; | 1435 | | | 1436 | 0 | default: | 1437 | 0 | CHECK( true, "unknown process state" ); | 1438 | 0 | return true; | 1439 | 59.2M | } | 1440 | | | 1441 | 0 | return false; | 1442 | 59.2M | } |
|
1443 | | |
1444 | | void EncSlice::encodeSliceData( Picture* pic ) |
1445 | 1.29k | { |
1446 | 1.29k | CodingStructure& cs = *pic->cs; |
1447 | 1.29k | Slice* const slice = cs.slice; |
1448 | 1.29k | const uint32_t startCtuTsAddr = slice->sliceMap.ctuAddrInSlice[0]; |
1449 | 1.29k | const uint32_t boundingCtuTsAddr = cs.pcv->sizeInCtus; |
1450 | 1.29k | const bool wavefrontsEnabled = slice->sps->entropyCodingSyncEnabled; |
1451 | | |
1452 | | // this ensures that independently encoded bitstream chunks can be combined to bit-equal |
1453 | 1.29k | const SliceType cabacTableIdx = ! slice->pps->cabacInitPresent || slice->pendingRasInit ? slice->sliceType : m_encCABACTableIdx; |
1454 | 1.29k | slice->encCABACTableIdx = cabacTableIdx; |
1455 | | |
1456 | | // initialise entropy coder for the slice |
1457 | 1.29k | m_CABACWriter.initCtxModels( *slice ); |
1458 | | |
1459 | 1.29k | DTRACE( g_trace_ctx, D_HEADER, "=========== POC: %d ===========\n", slice->poc ); |
1460 | | |
1461 | 1.29k | int prevQP[MAX_NUM_CH]; |
1462 | 1.29k | prevQP[0] = prevQP[1] = slice->sliceQp; |
1463 | | |
1464 | 1.29k | const PreCalcValues& pcv = *cs.pcv; |
1465 | 1.29k | const uint32_t widthInCtus = pcv.widthInCtus; |
1466 | 1.29k | uint32_t uiSubStrm = 0; |
1467 | 1.29k | const int numSubstreamsColumns = slice->pps->numTileCols; |
1468 | 1.29k | const int numSubstreamRows = slice->sps->entropyCodingSyncEnabled ? pic->cs->pcv->heightInCtus : slice->pps->numTileRows; |
1469 | 1.29k | const int numSubstreams = std::max<int>( numSubstreamRows * numSubstreamsColumns, 0/*(int)pic->brickMap->bricks.size()*/ ); |
1470 | 1.29k | std::vector<OutputBitstream> substreamsOut( numSubstreams ); |
1471 | | |
1472 | 1.29k | slice->clearSubstreamSizes(); |
1473 | | |
1474 | 5.34k | for( uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++ ) |
1475 | 4.05k | { |
1476 | 4.05k | const uint32_t ctuRsAddr = slice->sliceMap.ctuAddrInSlice[ctuTsAddr]; |
1477 | 4.05k | const uint32_t ctuXPosInCtus = ctuRsAddr % widthInCtus; |
1478 | 4.05k | const uint32_t ctuYPosInCtus = ctuRsAddr / widthInCtus; |
1479 | 4.05k | const uint32_t tileXPosInCtus = slice->pps->tileColBd[cs.pps->ctuToTileCol[ctuXPosInCtus]]; |
1480 | 4.05k | const uint32_t tileYPosInCtus = slice->pps->tileRowBd[cs.pps->ctuToTileRow[ctuYPosInCtus]]; |
1481 | | |
1482 | 4.05k | DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) ); |
1483 | | |
1484 | 4.05k | const Position pos (ctuXPosInCtus * pcv.maxCUSize, ctuYPosInCtus * pcv.maxCUSize); |
1485 | 4.05k | const UnitArea ctuArea (cs.area.chromaFormat, Area(pos.x, pos.y, pcv.maxCUSize, pcv.maxCUSize)); |
1486 | 4.05k | CHECK( uiSubStrm >= numSubstreams, "array index out of bounds" ); |
1487 | 4.05k | m_CABACWriter.initBitstream( &substreamsOut[ uiSubStrm ] ); |
1488 | | |
1489 | | // set up CABAC contexts' state for this CTU |
1490 | 4.05k | if (ctuXPosInCtus == tileXPosInCtus && ctuYPosInCtus == tileYPosInCtus ) |
1491 | 1.29k | { |
1492 | 1.29k | if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset |
1493 | 0 | { |
1494 | 0 | m_CABACWriter.initCtxModels( *slice ); |
1495 | 0 | } |
1496 | 1.29k | prevQP[0] = prevQP[1] = slice->sliceQp; |
1497 | 1.29k | } |
1498 | 2.75k | else if (ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled) |
1499 | 0 | { |
1500 | | // Synchronize cabac probabilities with upper-right CTU if it's available and at the start of a line. |
1501 | 0 | if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset |
1502 | 0 | { |
1503 | 0 | m_CABACWriter.initCtxModels( *slice ); |
1504 | 0 | } |
1505 | 0 | if( cs.getCURestricted( pos.offset( 0, -1 ), pos, slice->independentSliceIdx, slice->pps->getTileIdx( ctuXPosInCtus, ctuYPosInCtus ), CH_L, TREE_D ) ) |
1506 | 0 | { |
1507 | | // Top-right is available, so use it. |
1508 | 0 | m_CABACWriter.getCtx() = m_entropyCodingSyncContextState; |
1509 | 0 | } |
1510 | 0 | prevQP[0] = prevQP[1] = slice->sliceQp; |
1511 | 0 | } |
1512 | | |
1513 | 4.05k | m_CABACWriter.coding_tree_unit( cs, ctuArea, prevQP, ctuRsAddr ); |
1514 | | |
1515 | | // store probabilities of second CTU in line into buffer |
1516 | 4.05k | if( ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled ) |
1517 | 0 | { |
1518 | 0 | m_entropyCodingSyncContextState = m_CABACWriter.getCtx(); |
1519 | 0 | } |
1520 | | |
1521 | | // terminate the sub-stream, if required (end of slice-segment, end of tile, end of wavefront-CTU-row): |
1522 | 4.05k | bool isMoreCTUsinSlice = ctuTsAddr != (boundingCtuTsAddr - 1); |
1523 | 4.05k | bool isLastCTUinTile = isMoreCTUsinSlice && slice->pps->getTileIdx( ctuRsAddr ) != slice->pps->getTileIdx( slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] ); |
1524 | 4.05k | bool isLastCTUinWPP = wavefrontsEnabled && isMoreCTUsinSlice && !isLastCTUinTile && ( (slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus) == cs.pps->tileColBd[cs.pps->ctuToTileCol[slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus]] ); //TODO: adjust tile bound condition |
1525 | | |
1526 | 4.05k | if (isLastCTUinWPP || !isMoreCTUsinSlice || isLastCTUinTile ) // this the the last CTU of either tile/brick/WPP/slice |
1527 | 1.29k | { |
1528 | 1.29k | m_CABACWriter.end_of_slice(); |
1529 | | |
1530 | | // Byte-alignment in slice_data() when new tile |
1531 | 1.29k | substreamsOut[ uiSubStrm ].writeByteAlignment(); |
1532 | | |
1533 | 1.29k | if (isMoreCTUsinSlice) //Byte alignment only when it is not the last substream in the slice |
1534 | 0 | { |
1535 | | // write sub-stream size |
1536 | 0 | slice->addSubstreamSize( ( substreamsOut[ uiSubStrm ].getNumberOfWrittenBits() >> 3 ) + substreamsOut[ uiSubStrm ].countStartCodeEmulations() ); |
1537 | 0 | } |
1538 | 1.29k | uiSubStrm++; |
1539 | 1.29k | } |
1540 | 4.05k | } // CTU-loop |
1541 | | |
1542 | 1.29k | if(slice->pps->cabacInitPresent) |
1543 | 0 | { |
1544 | 0 | m_encCABACTableIdx = m_CABACWriter.getCtxInitId( *slice ); |
1545 | 0 | } |
1546 | 1.29k | else |
1547 | 1.29k | { |
1548 | 1.29k | m_encCABACTableIdx = slice->sliceType; |
1549 | 1.29k | } |
1550 | | |
1551 | | // concatenate substreams |
1552 | 1.29k | OutputBitstream& outStream = pic->sliceDataStreams[ 0/*slice->sliceIdx*/ ]; |
1553 | 2.59k | for ( int i = 0; i < slice->getNumberOfSubstreamSizes() + 1; i++ ) |
1554 | 1.29k | { |
1555 | 1.29k | outStream.addSubstream( &(substreamsOut[ i ]) ); |
1556 | 1.29k | } |
1557 | 1.29k | pic->sliceDataNumBins += m_CABACWriter.getNumBins(); |
1558 | 1.29k | } |
1559 | | |
1560 | | } // namespace vvenc |
1561 | | |
1562 | | //! \} |
1563 | | |