Coverage Report

Created: 2026-06-10 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/EncoderLib/EncSlice.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     EncSlice.cpp
45
    \brief    slice encoder class
46
*/
47
48
#include "EncSlice.h"
49
#include "EncStage.h"
50
#include "EncLib.h"
51
#include "EncPicture.h"
52
#include "BitAllocation.h"
53
#include "CommonLib/UnitTools.h"
54
#include "CommonLib/Picture.h"
55
#include "CommonLib/TimeProfiler.h"
56
#include "CommonLib/dtrace_codingstruct.h"
57
#include "Utilities/NoMallocThreadPool.h"
58
59
#include <math.h>
60
#include "vvenc/vvencCfg.h"
61
62
//! \ingroup EncoderLib
63
//! \{
64
65
namespace vvenc {
66
67
#ifdef TRACE_ENABLE_ITT
// ITT tracing handles; compiled only when TRACE_ENABLE_ITT is defined.
// One domain ("Encode") plus one string handle per task label used in this file.
static const __itt_domain*        itt_domain_encode        = __itt_domain_create( "Encode" );

// CTU encoding and in-loop filter stages
static const __itt_string_handle* itt_handle_ctuEncode     = __itt_string_handle_create( "Encode_CTU" );
static const __itt_string_handle* itt_handle_rspLfVer      = __itt_string_handle_create( "RspLfVer_CTU" );
static const __itt_string_handle* itt_handle_lfHor         = __itt_string_handle_create( "LfHor_CTU" );
static const __itt_string_handle* itt_handle_sao           = __itt_string_handle_create( "SAO_CTU" );

// ALF stages
static const __itt_string_handle* itt_handle_alf_stat      = __itt_string_handle_create( "ALF_CTU_STAT" );
static const __itt_string_handle* itt_handle_alf_derive    = __itt_string_handle_create( "ALF_DERIVE" );
static const __itt_string_handle* itt_handle_alf_recon     = __itt_string_handle_create( "ALF_RECONSTRUCT" );

// CC-ALF stages
static const __itt_string_handle* itt_handle_ccalf_stat    = __itt_string_handle_create( "CCALF_CTU_STAT" );
static const __itt_string_handle* itt_handle_ccalf_derive  = __itt_string_handle_create( "CCALF_DERIVE" );
static const __itt_string_handle* itt_handle_ccalf_recon   = __itt_string_handle_create( "CCALF_RECONSTRUCT" );
#endif
80
81
void setArbitraryWppPattern( const PreCalcValues& pcv, std::vector<int>& ctuAddrMap, int stepX = 1 )
82
4.44k
{
83
4.44k
  ctuAddrMap.resize( pcv.sizeInCtus, 0 );
84
4.44k
  std::vector<int> x_in_line( pcv.heightInCtus, 0 );
85
4.44k
  int x = 0, y = 0, addr = 0;
86
4.44k
  int y_top = 0;
87
4.44k
  const int step = stepX; // number of CTUs in x-direction to scan 
88
4.44k
  ctuAddrMap[addr++] = x++; // first entry (can be omitted)
89
10.4k
  while( addr < pcv.sizeInCtus )
90
6.76k
  {
91
    // fill entries in x-direction
92
6.76k
    int x1 = x;
93
16.1k
    while( x < std::min(x1 + step, (int)pcv.widthInCtus) )
94
9.41k
    {
95
      // general WPP condition (top-right CTU availability)
96
9.41k
      if( y > 0 && !( x_in_line[y - 1] - x >= 2 ) && x != pcv.widthInCtus - 1 )
97
0
        break;
98
9.41k
      ctuAddrMap[addr++] = y*pcv.widthInCtus + x;
99
9.41k
      x++;
100
9.41k
    }
101
6.76k
    x_in_line[y] = x;
102
        
103
6.76k
    y += 1;
104
105
6.76k
    if( y >= pcv.heightInCtus )
106
3.38k
    {
107
      // go up
108
3.38k
      if( x_in_line[y_top] >= pcv.widthInCtus )
109
3.38k
      {
110
3.38k
        y_top++;
111
3.38k
        if( y_top >= pcv.heightInCtus )
112
780
        {
113
          // done
114
780
          break;
115
780
        }
116
3.38k
      }
117
2.60k
      y = y_top;
118
2.60k
    }
119
5.98k
    x = x_in_line[y];
120
121
5.98k
    CHECK( y >= pcv.heightInCtus, "Height in CTUs is exceeded" );
122
5.98k
  }
123
4.44k
}
124
125
struct TileLineEncRsrc
126
{
127
  BitEstimator            m_BitEstimator;
128
  CABACWriter             m_CABACEstimator;
129
  BitEstimator            m_SaoBitEstimator;
130
  CABACWriter             m_SaoCABACEstimator;
131
  BitEstimator            m_AlfBitEstimator;
132
  CABACWriter             m_AlfCABACEstimator;
133
  ReuseUniMv              m_ReuseUniMv;
134
  BlkUniMvInfoBuffer      m_BlkUniMvInfoBuffer;
135
  AffineProfList          m_AffineProfList;
136
  IbcBvCand               m_CachedBvs;
137
  EncSampleAdaptiveOffset m_encSao;
138
  int                     m_prevQp[ MAX_NUM_CH ];
139
7.81k
  TileLineEncRsrc( const VVEncCfg& encCfg ) : m_CABACEstimator( m_BitEstimator ), m_SaoCABACEstimator( m_SaoBitEstimator ), m_AlfCABACEstimator( m_AlfBitEstimator ) { m_AffineProfList.init( ! encCfg.m_picReordering ); }
140
};
141
142
struct PerThreadRsrc
143
{
144
  CtxCache  m_CtxCache;
145
  EncCu     m_encCu;
146
  PelStorage m_alfTempCtuBuf;
147
};
148
149
struct CtuEncParam
150
{
151
  Picture*  pic;
152
  EncSlice* encSlice;
153
  int       ctuRsAddr;
154
  int       ctuPosX;
155
  int       ctuPosY;
156
  UnitArea  ctuArea;
157
  int       tileLineResIdx;
158
159
13.8k
  CtuEncParam() : pic( nullptr ), encSlice( nullptr ), ctuRsAddr( 0 ), ctuPosX( 0 ), ctuPosY( 0 ), ctuArea(), tileLineResIdx( 0 ) {}
160
  CtuEncParam( Picture* _p, EncSlice* _s, const int _r, const int _x, const int _y, const int _tileLineResIdx )
161
    : pic( _p )
162
    , encSlice( _s )
163
    , ctuRsAddr( _r )
164
    , ctuPosX( _x )
165
    , ctuPosY( _y )
166
    , ctuArea( pic->chromaFormat, pic->slices[0]->pps->pcv->getCtuArea( _x, _y ) )
167
0
    , tileLineResIdx( _tileLineResIdx ) {}
168
};
169
170
// ====================================================================================================================
171
// Constructor / destructor / create / destroy
172
// ====================================================================================================================
173
174
// Default state: no collaborators attached yet (all pointers null), a CTU
// encode delay of 1, the CABAC writer bound to m_BinEncoder and the CABAC
// table index set to VVENC_I_SLICE. The pointers are filled in by init().
EncSlice::EncSlice()
  : m_pcEncCfg( nullptr )
  , m_threadPool( nullptr )
  , m_ctuTasksDoneCounter( nullptr )
  , m_ctuEncDelay( 1 )
  , m_pLoopFilter( nullptr )
  , m_pALF( nullptr )
  , m_pcRateCtrl( nullptr )
  , m_CABACWriter( m_BinEncoder )
  , m_encCABACTableIdx( VVENC_I_SLICE )
{
}
186
187
188
// Releases everything allocated in init(): per-tile-line resources,
// per-thread resources (including their ALF temp buffers) and SAO statistics.
EncSlice::~EncSlice()
{
  for( TileLineEncRsrc* tileLineRsrc : m_TileLineEncRsrc )
  {
    delete tileLineRsrc;
  }
  m_TileLineEncRsrc.clear();

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_alfTempCtuBuf.destroy();
    delete threadRsrc;
  }
  m_ThreadRsrc.clear();

  m_saoReconParams.clear();

  // SAO statistics are stored per CTU as an array of per-component arrays
  for( SAOStatData** ctuStats : m_saoStatData )
  {
    for( int comp = 0; comp < MAX_NUM_COMP; comp++ )
    {
      delete[] ctuStats[ comp ];
    }
    delete[] ctuStats;
  }
  m_saoStatData.clear();
}
215
216
void EncSlice::init( const VVEncCfg& encCfg,
217
                     const SPS& sps,
218
                     const PPS& pps,
219
                     std::vector<int>* const globalCtuQpVector,
220
                     LoopFilter& loopFilter,
221
                     EncAdaptiveLoopFilter& alf,
222
                     RateCtrl& rateCtrl,
223
                     NoMallocThreadPool* threadPool,
224
                     WaitCounter* ctuTasksDoneCounter )
225
4.44k
{
226
4.44k
  m_pcEncCfg            = &encCfg;
227
4.44k
  m_pLoopFilter         = &loopFilter;
228
4.44k
  m_pALF                = &alf;
229
4.44k
  m_pcRateCtrl          = &rateCtrl;
230
4.44k
  m_threadPool          = threadPool;
231
4.44k
  m_ctuTasksDoneCounter = ctuTasksDoneCounter;
232
4.44k
  m_syncPicCtx.resize( encCfg.m_entropyCodingSyncEnabled ? pps.getNumTileLineIds() : 0 );
233
234
  
235
4.44k
  const int maxCntRscr = ( encCfg.m_numThreads > 0 ) ? pps.getNumTileLineIds() : 1;
236
4.44k
  const int maxCtuEnc  = ( encCfg.m_numThreads > 0 && threadPool ) ? threadPool->numThreads() : 1;
237
238
4.44k
  m_ThreadRsrc.resize( maxCtuEnc,  nullptr );
239
4.44k
  m_TileLineEncRsrc.resize( maxCntRscr, nullptr );
240
241
4.44k
  for( PerThreadRsrc*& taskRsc : m_ThreadRsrc )
242
17.7k
  {
243
17.7k
    taskRsc = new PerThreadRsrc();
244
17.7k
    taskRsc->m_encCu.init( encCfg,
245
17.7k
                           sps,
246
17.7k
                           globalCtuQpVector,
247
17.7k
                           m_syncPicCtx.data(),
248
17.7k
                           &rateCtrl );
249
17.7k
    taskRsc->m_alfTempCtuBuf.create( pps.pcv->chrFormat, Area( 0, 0, pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1), pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1) ), pps.pcv->maxCUSize, MAX_ALF_PADDING_SIZE, 0, false );
250
17.7k
  }
251
252
4.44k
  for( TileLineEncRsrc*& lnRsc : m_TileLineEncRsrc )
253
7.81k
  {
254
7.81k
    lnRsc = new TileLineEncRsrc( encCfg );
255
7.81k
    if( sps.saoEnabled )
256
7.81k
    {
257
7.81k
      lnRsc->m_encSao.init( encCfg );
258
7.81k
    }
259
7.81k
  }
260
261
4.44k
  const int sizeInCtus = pps.pcv->sizeInCtus;
262
4.44k
  m_processStates = std::vector<ProcessCtuState>( sizeInCtus );
263
4.44k
  m_saoReconParams.resize( sizeInCtus );
264
265
4.44k
  ::memset( m_saoDisabledRate, 0, sizeof( m_saoDisabledRate ) );
266
267
  // sao statistics
268
4.44k
  if( encCfg.m_bUseSAO )
269
4.44k
  {
270
4.44k
    m_saoStatData.resize( sizeInCtus );
271
18.3k
    for( int i = 0; i < sizeInCtus; i++ )
272
13.8k
    {
273
13.8k
      m_saoStatData[ i ] = new SAOStatData*[ MAX_NUM_COMP ];
274
55.4k
      for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ )
275
41.5k
      {
276
41.5k
        m_saoStatData[ i ][ compIdx ] = new SAOStatData[ NUM_SAO_NEW_TYPES ];
277
41.5k
      }
278
13.8k
    }
279
4.44k
  }
280
4.44k
  ctuEncParams.resize( sizeInCtus );
281
4.44k
  setArbitraryWppPattern( *pps.pcv, m_ctuAddrMap, 3 );
282
283
4.44k
  const unsigned asuHeightInCtus = m_pALF->getAsuHeightInCtus();
284
4.44k
  const unsigned numDeriveLines  = encCfg.m_ifpLines ? 
285
4.44k
    std::min( ((encCfg.m_ifpLines & (~(asuHeightInCtus - 1))) + asuHeightInCtus), pps.pcv->heightInCtus ) : pps.pcv->heightInCtus;
286
4.44k
  m_alfDeriveCtu  = numDeriveLines * pps.pcv->widthInCtus - 1;
287
4.44k
  m_ccalfDeriveCtu = encCfg.m_ifpLines ? pps.pcv->widthInCtus * std::min((unsigned)encCfg.m_ifpLines + 1, pps.pcv->heightInCtus) - 1: pps.pcv->sizeInCtus - 1;
288
4.44k
}
289
290
291
/** Per-picture setup: slice map, CABAC table index, slice QP/lambda,
 *  per-thread and per-tile-line state, and the CTU encode delay.
 *
 *  \param pic  picture about to be encoded; its cs->slice is modified
 */
void EncSlice::initPic( Picture* pic )
{
  Slice*      slice = pic->cs->slice;
  const auto& pps   = *slice->pps;

  // slice map: taken from the PPS when there is more than one tile,
  // otherwise a single slice covering all CTUs of the picture
  const bool hasMultipleTiles = pps.numTileCols * pps.numTileRows > 1;
  if( hasMultipleTiles )
  {
    slice->sliceMap = pps.sliceMap[0];
  }
  else
  {
    const auto widthInCtus  = pic->cs->pcv->widthInCtus;
    const auto heightInCtus = pic->cs->pcv->heightInCtus;
    slice->sliceMap.addCtusToSlice( 0, widthInCtus, 0, heightInCtus, widthInCtus );
  }

  // this ensures that independently encoded bitstream chunks can be combined to bit-equal
  const bool      useOwnSliceType = ! pps.cabacInitPresent || slice->pendingRasInit;
  const SliceType cabacTableIdx   = useOwnSliceType ? slice->sliceType : m_encCABACTableIdx;
  slice->encCABACTableIdx         = cabacTableIdx;

  // set QP and lambda values
  xInitSliceLambdaQP( slice );

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.initPic( pic );
  }

  for( TileLineEncRsrc* tileLineRsrc : m_TileLineEncRsrc )
  {
    tileLineRsrc->m_ReuseUniMv.resetReusedUniMvs();
  }

  m_ctuEncDelay = 1;
  if( pic->useIBC )
  {
    // IBC needs unfiltered samples up to max IBC search range,
    // therefore ensure that numCtuDelayLUT CTUs have been encoded first
    // (assuming IBC localSearchRangeX / Y = 128).
    // The table is indexed by maxCUSizeLog2 - 5, i.e. log2 CTU sizes 5, 6 and 7.
    const int numCtuDelayLUT[ 3 ] = { 15, 3, 1 };
    CHECK( pic->cs->pcv->maxCUSizeLog2 < 5 || pic->cs->pcv->maxCUSizeLog2 > 7, "invalid max CTUSize" );
    m_ctuEncDelay = numCtuDelayLUT[ pic->cs->pcv->maxCUSizeLog2 - 5 ];
  }
}
332
333
334
335
/** Determines slice QP, lambda and slice-level chroma QP deltas, pushes QP/lambda
 *  to every per-thread CU encoder and stores the results in the slice.
 *
 *  \param slice  slice to set up; sliceQp, sliceChromaQpDelta[] and
 *                chromaQpAdjEnabled are written, pic->picInitialQP may be reset
 */
void EncSlice::xInitSliceLambdaQP( Slice* slice )
{
  const VVEncCfg& cfg = *m_pcEncCfg;

  // true for intra slices without IBC, or when the POC hits the configured chroma QP offset period
  const auto useIntraOrPeriodicOffset = [&]() -> bool
  {
    return ( slice->isIntra() && !slice->sps->IBC )
        || ( cfg.m_sliceChromaQpOffsetPeriodicity > 0 && ( slice->poc % cfg.m_sliceChromaQpOffsetPeriodicity ) == 0 );
  };

  // pre-compute lambda and QP
  const bool isRcSecondPass = ( cfg.m_RCTargetBitrate > 0 && slice->pic->picInitialQP >= 0 ); // 2nd pass
  int        sliceQp        = Clip3( -slice->sps->qpBDOffset[CH_L], MAX_QP, slice->pic->picInitialQP ); // RC start QP
  double     qpPrecise;
  double     lambda;
  if( isRcSecondPass )
  {
    qpPrecise = (double) slice->pic->picInitialQP;
    lambda    = slice->pic->picInitialLambda;
  }
  else
  {
    qpPrecise = xGetQPForPicture( slice );
    lambda    = xCalculateLambda( slice, slice->TLayer, qpPrecise, qpPrecise, sliceQp ); // also sets sliceQp
  }

  int chromaOffsetsIntraOrPeriodic[2] = { cfg.m_sliceChromaQpOffsetIntraOrPeriodic[0], cfg.m_sliceChromaQpOffsetIntraOrPeriodic[1] };
  const int lookAheadRCCQpOffset      = 0;   // was (m_pcEncCfg->m_RCTargetBitrate > 0 && m_pcEncCfg->m_LookAhead && CS::isDualITree (*slice->pic->cs) ? 1 : 0);
  int cbQP   = 0;
  int crQP   = 0;
  int cbCrQP = 0;

  if( cfg.m_usePerceptQPA ) // adapt chromaOffsetsIntraOrPeriodic and pic->ctuAdaptedQP
  {
    const bool     cqp               = useIntraOrPeriodicOffset();
    const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
    const uint32_t boundingCtuTsAddr = slice->pic->cs->pcv->sizeInCtus;

    // the chroma offsets are only handed to the adaptation when slice-level chroma QP is on and cqp holds
    int* const adaptableOffsets = ( slice->pps->sliceChromaQpFlag && cqp ) ? chromaOffsetsIntraOrPeriodic : nullptr;

    sliceQp = BitAllocation::applyQPAdaptationSlice( slice, m_pcEncCfg, sliceQp, lambda, &slice->pic->picVA.visAct, // updates pic->picInitialQP
                                                     *m_ThreadRsrc[0]->m_encCu.getQpPtr(), m_pcRateCtrl->getIntraPQPAStats(),
                                                     adaptableOffsets,
                                                     m_pcRateCtrl->getMinNoiseLevels(), startCtuTsAddr, boundingCtuTsAddr );
    if( sliceQp >= 0 ) // QP OK?
    {
      lambda *= pow( 2.0, ((double) sliceQp - qpPrecise) / 3.0 ); // adjust lambda based on change of slice QP
    }
    else
    {
      sliceQp = (int) qpPrecise; // revert to unadapted slice QP
    }
  }
  else if( isRcSecondPass )
  {
    slice->pic->picInitialQP = -1; // no QPA - unused now
  }

  if( slice->pps->sliceChromaQpFlag )
  {
    const bool useDualTreeOffsets = CS::isDualITree( *slice->pic->cs )
                                 && !cfg.m_usePerceptQPA
                                 && ( cfg.m_sliceChromaQpOffsetPeriodicity == 0 );
    if( useDualTreeOffsets )
    {
      // QP offset for dual-tree
      cbQP   = cfg.m_chromaCbQpOffsetDualTree   + lookAheadRCCQpOffset;
      crQP   = cfg.m_chromaCrQpOffsetDualTree   + lookAheadRCCQpOffset;
      cbCrQP = cfg.m_chromaCbCrQpOffsetDualTree + lookAheadRCCQpOffset;
    }
    else
    {
      const GOPEntry& gopEntry                  = *(slice->pic->gopEntry);
      const bool      bUseIntraOrPeriodicOffset = useIntraOrPeriodicOffset();

      cbQP   = (bUseIntraOrPeriodicOffset ? chromaOffsetsIntraOrPeriodic[0] : gopEntry.m_CbQPoffset) + lookAheadRCCQpOffset;
      crQP   = (bUseIntraOrPeriodicOffset ? chromaOffsetsIntraOrPeriodic[1] : gopEntry.m_CrQPoffset) + lookAheadRCCQpOffset;
      cbCrQP = (cbQP + crQP) >> 1; // use floor of average CbCr chroma QP offset for joint-CbCr coding

      // keep (PPS offset + slice offset) within [-12, 12]
      cbQP   = Clip3( -12, 12, cbQP   + slice->pps->chromaQpOffset[COMP_Cb] )         - slice->pps->chromaQpOffset[COMP_Cb];
      crQP   = Clip3( -12, 12, crQP   + slice->pps->chromaQpOffset[COMP_Cr] )         - slice->pps->chromaQpOffset[COMP_Cr];
      cbCrQP = Clip3( -12, 12, cbCrQP + slice->pps->chromaQpOffset[COMP_JOINT_CbCr] ) - slice->pps->chromaQpOffset[COMP_JOINT_CbCr];
    }
  }

  slice->sliceChromaQpDelta[COMP_Cb]         = Clip3( -12, 12, cbQP );
  slice->sliceChromaQpDelta[COMP_Cr]         = Clip3( -12, 12, crQP );
  slice->sliceChromaQpDelta[COMP_JOINT_CbCr] = (slice->sps->jointCbCr ? Clip3( -12, 12, cbCrQP ) : 0);

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.setUpLambda( *slice, lambda, sliceQp, true, true );
  }

  slice->sliceQp            = sliceQp;
  slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0;
}
398
399
static const int highTL[6] = { -1, 0, 0, 2, 4, 5 };
400
401
int EncSlice::xGetQPForPicture( const Slice* slice )
402
1.11k
{
403
1.11k
  const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ];
404
1.11k
  int qp;
405
406
1.11k
  if ( m_pcEncCfg->m_costMode == VVENC_COST_LOSSLESS_CODING )
407
0
  {
408
0
    qp = LOSSLESS_AND_MIXED_LOSSLESS_RD_COST_TEST_QP;
409
0
  }
410
1.11k
  else
411
1.11k
  {
412
1.11k
    qp = m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP;
413
414
1.11k
    if (m_pcEncCfg->m_usePerceptQPA)
415
1.11k
    {
416
1.11k
      const int tlayer = slice->pic->gopEntry->m_vtl;
417
418
1.11k
      qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[tlayer] + ((qp * (16 + std::min (2, tlayer))) >> 4) + 0/*TODO +-1?*/);
419
1.11k
    }
420
0
    else if( slice->isIntra() )
421
0
    {
422
0
      qp += m_pcEncCfg->m_intraQPOffset;
423
0
    }
424
0
    else
425
0
    {
426
0
      if( qp != -lumaQpBDOffset )
427
0
      {
428
0
        const GOPEntry &gopEntry = *(slice->pic->gopEntry);
429
        // adjust QP according to the QP offset for the GOP entry.
430
0
        qp += gopEntry.m_QPOffset;
431
432
        // adjust QP according to QPOffsetModel for the GOP entry.
433
0
        double dqpOffset = qp * gopEntry.m_QPOffsetModelScale + gopEntry.m_QPOffsetModelOffset + 0.5;
434
0
        int qpOffset = (int)floor( Clip3<double>( 0.0, 3.0, dqpOffset ) );
435
0
        qp += qpOffset;
436
0
      }
437
0
    }
438
439
1.11k
    if( m_pcEncCfg->m_blockImportanceMapping && !slice->pic->m_picShared->m_ctuBimQpOffset.empty() )
440
0
    {
441
0
      qp += slice->pic->m_picShared->m_picAuxQpOffset;
442
0
    }
443
1.11k
  }
444
1.11k
  qp = Clip3( -lumaQpBDOffset, MAX_QP, qp );
445
1.11k
  return qp;
446
1.11k
}
447
448
449
double EncSlice::xCalculateLambda( const Slice* slice,
450
                                   const int    depth, // slice GOP hierarchical depth.
451
                                   const double refQP, // initial slice-level QP
452
                                   const double dQP,   // initial double-precision QP
453
                                         int&   iQP )  // returned integer QP.
454
1.11k
{
455
1.11k
  const GOPEntry &gopEntry = *(slice->pic->gopEntry);
456
1.11k
  const int SHIFT_QP       = 12;
457
1.11k
  const int temporalId     = gopEntry.m_temporalId;
458
1.11k
  std::vector<double> intraLambdaModifiers;
459
1.11k
  for ( int i = 0; i < VVENC_MAX_TLAYER; i++ )
460
1.11k
  {
461
1.11k
    if( m_pcEncCfg->m_adIntraLambdaModifier[i] != 0.0 ) intraLambdaModifiers.push_back( m_pcEncCfg->m_adIntraLambdaModifier[i] );
462
1.11k
    else break;
463
1.11k
  }
464
465
1.11k
  int bitdepth_luma_qp_scale = 6
466
1.11k
                               * (slice->sps->bitDepths[ CH_L ] - 8
467
1.11k
                                  - DISTORTION_PRECISION_ADJUSTMENT(slice->sps->bitDepths[ CH_L ]));
468
1.11k
  double qp_temp = dQP + bitdepth_luma_qp_scale - SHIFT_QP;
469
  // Case #1: I or P-slices (key-frame)
470
1.11k
  double dQPFactor = gopEntry.m_QPFactor;
471
1.11k
  if( slice->sliceType == VVENC_I_SLICE )
472
1.11k
  {
473
1.11k
    if (m_pcEncCfg->m_dIntraQpFactor>=0.0 && gopEntry.m_sliceType != 'I')
474
0
    {
475
0
      dQPFactor = m_pcEncCfg->m_dIntraQpFactor;
476
0
    }
477
1.11k
    else
478
1.11k
    {
479
1.11k
      dQPFactor = 0.57;
480
1.11k
      if( ! m_pcEncCfg->m_lambdaFromQPEnable )
481
0
      {
482
0
        const int NumberBFrames = ( m_pcEncCfg->m_GOPSize - 1 );
483
0
        const double dLambda_scale = 1.0 - Clip3( 0.0, 0.5, 0.05 * (double)NumberBFrames );
484
0
        dQPFactor *= dLambda_scale;
485
0
      }
486
1.11k
    }
487
1.11k
  }
488
0
  else if( m_pcEncCfg->m_lambdaFromQPEnable )
489
0
  {
490
0
    dQPFactor=0.57;
491
0
  }
492
493
1.11k
  double dLambda = dQPFactor*pow( 2.0, qp_temp/3.0 );
494
495
1.11k
  if( !(m_pcEncCfg->m_lambdaFromQPEnable) && depth>0 )
496
0
  {
497
0
    double qp_temp_ref = refQP + bitdepth_luma_qp_scale - SHIFT_QP;
498
0
    dLambda *= Clip3(2.00, 4.00, (qp_temp_ref / 6.0));   // (j == B_SLICE && p_cur_frm->layer != 0 )
499
0
  }
500
501
  // if hadamard is used in ME process
502
1.11k
  if ( !m_pcEncCfg->m_bUseHADME && slice->sliceType != VVENC_I_SLICE )
503
0
  {
504
0
    dLambda *= 0.95;
505
0
  }
506
507
1.11k
  double lambdaModifier;
508
1.11k
  if( slice->sliceType != VVENC_I_SLICE || intraLambdaModifiers.empty())
509
1.11k
  {
510
1.11k
    lambdaModifier = m_pcEncCfg->m_adLambdaModifier[ temporalId ];
511
1.11k
  }
512
0
  else
513
0
  {
514
0
    lambdaModifier = intraLambdaModifiers[ (temporalId < intraLambdaModifiers.size()) ? temporalId : (intraLambdaModifiers.size()-1) ];
515
0
  }
516
1.11k
  dLambda *= lambdaModifier;
517
518
1.11k
  iQP = Clip3( -slice->sps->qpBDOffset[ CH_L ], MAX_QP, (int) floor( dQP + 0.5 ) );
519
520
1.11k
  if( m_pcEncCfg->m_DepQuantEnabled )
521
1.11k
  {
522
1.11k
    dLambda *= pow( 2.0, 0.25/3.0 ); // slight lambda adjustment for dependent quantization (due to different slope of quantizer)
523
1.11k
  }
524
525
  // NOTE: the lambda modifiers that are sometimes applied later might be best always applied in here.
526
1.11k
  return dLambda;
527
1.11k
}
528
529
530
// ====================================================================================================================
531
// Public member functions
532
// ====================================================================================================================
533
534
535
/** \param pic   picture class
536
 */
537
void EncSlice::compressSlice( Picture* pic )
538
1.11k
{
539
1.11k
  PROFILER_SCOPE_AND_STAGE( 1, g_timeProfiler, P_COMPRESS_SLICE );
540
1.11k
  CodingStructure& cs         = *pic->cs;
541
1.11k
  Slice* const slice          = cs.slice;
542
1.11k
  uint32_t  startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
543
1.11k
  uint32_t  boundingCtuTsAddr = pic->cs->pcv->sizeInCtus;
544
545
1.11k
  cs.pcv      = slice->pps->pcv;
546
1.11k
  cs.fracBits = 0;
547
548
1.11k
  if( startCtuTsAddr == 0 )
549
1.11k
  {
550
1.11k
    cs.initStructData( slice->sliceQp );
551
1.11k
  }
552
553
1.11k
  for( auto* thrRsrc : m_ThreadRsrc )
554
4.44k
  {
555
4.44k
    thrRsrc->m_encCu.initSlice( slice );
556
4.44k
  }
557
558
1.11k
  for( auto* lnRsrc : m_TileLineEncRsrc )
559
1.95k
  {
560
1.95k
    lnRsrc->m_CABACEstimator    .initCtxModels( *slice );
561
1.95k
    lnRsrc->m_SaoCABACEstimator .initCtxModels( *slice );
562
1.95k
    lnRsrc->m_AlfCABACEstimator .initCtxModels( *slice );
563
1.95k
    lnRsrc->m_AffineProfList    .resetAffineMVList();
564
1.95k
    lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
565
1.95k
    lnRsrc->m_CachedBvs         .resetIbcBvCand();
566
567
1.95k
    if( slice->sps->saoEnabled && pic->useSAO )
568
1.95k
    {
569
1.95k
      lnRsrc->m_encSao          .initSlice( slice );
570
1.95k
    }
571
1.95k
  }
572
573
1.11k
  if( slice->sps->fpelMmvd && !slice->picHeader->disFracMMVD )
574
1.11k
  {
575
1.11k
    slice->picHeader->disFracMMVD = ( pic->lwidth() * pic->lheight() > 1920 * 1080 ) ? true : false;
576
1.11k
  }
577
578
1.11k
  xProcessCtus( pic, startCtuTsAddr, boundingCtuTsAddr );
579
1.11k
}
580
581
/** Chooses the joint-CbCr sign for the picture header from the correlation of the
 *  high-pass filtered original Cb and Cr planes inside the given luma area.
 *
 *  \param cs           coding structure; cs.slice->picHeader->jointCbCrSign is written
 *  \param topLeftLuma  top-left corner of the analysed area in luma samples
 *  \param sizeLuma     size of the analysed area in luma samples
 *
 *  The sign flag is set when the accumulated Cb/Cr correlation is negative. It stays
 *  true when the chroma format has no chroma planes.
 */
void setJointCbCrModes( CodingStructure& cs, const Position topLeftLuma, const Size sizeLuma )
{
  bool              sgnFlag = true;

  if( isChromaEnabled( cs.picture->chromaFormat) )
  {
    const CompArea  cbArea  = CompArea( COMP_Cb, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );
    const CompArea  crArea  = CompArea( COMP_Cr, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );

    // use the reshaped original when a filtered original buffer is present, the plain original otherwise
    const CPelBuf   orgCb   = cs.picture->getFilteredOrigBuffer().valid() ? cs.picture->getRspOrigBuf( cbArea ): cs.picture->getOrigBuf( cbArea );
    const CPelBuf   orgCr   = cs.picture->getFilteredOrigBuffer().valid() ? cs.picture->getRspOrigBuf( crArea ): cs.picture->getOrigBuf( crArea );

    // the 3x3 filter below reads one sample in every direction, so the first/last
    // column and row are skipped where the area touches the picture border
    const int       x0      = ( cbArea.x > 0 ? 0 : 1 );
    const int       y0      = ( cbArea.y > 0 ? 0 : 1 );
    const int       x1      = ( cbArea.x + cbArea.width  < cs.picture->Cb().width  ? cbArea.width  : cbArea.width  - 1 );
    const int       y1      = ( cbArea.y + cbArea.height < cs.picture->Cb().height ? cbArea.height : cbArea.height - 1 );
    const int       cbs     = orgCb.stride;
    const int       crs     = orgCr.stride;
    const Pel*      pCb     = orgCb.buf + y0 * cbs;
    const Pel*      pCr     = orgCr.buf + y0 * crs;
    int64_t         sumCbCr = 0;

    // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes
    for( int y = y0; y < y1; y++, pCb += cbs, pCr += crs )
    {
      for( int x = x0; x < x1; x++ )
      {
        // 3x3 high-pass kernel: centre weight 12, edge neighbours -2, corner neighbours -1
        const int cb = ( 12*(int)pCb[x] - 2*((int)pCb[x-1] + (int)pCb[x+1] + (int)pCb[x-cbs] + (int)pCb[x+cbs]) - ((int)pCb[x-1-cbs] + (int)pCb[x+1-cbs] + (int)pCb[x-1+cbs] + (int)pCb[x+1+cbs]) );
        const int cr = ( 12*(int)pCr[x] - 2*((int)pCr[x-1] + (int)pCr[x+1] + (int)pCr[x-crs] + (int)pCr[x+crs]) - ((int)pCr[x-1-crs] + (int)pCr[x+1-crs] + (int)pCr[x-1+crs] + (int)pCr[x+1+crs]) );

        // widen before multiplying: each term can reach 12x the sample range, so the
        // product of two ints can exceed the int range for large sample values
        sumCbCr += (int64_t) cb * cr;
      }
    }

    sgnFlag = ( sumCbCr < 0 );
  }

  cs.slice->picHeader->jointCbCrSign = sgnFlag;
}
618
619
// Immutable position of one CTU: column, row and raster-scan address.
struct CtuPos
{
  const int ctuPosX;
  const int ctuPosY;
  const int ctuRsAddr;

  // _x: CTU column, _y: CTU row, _a: raster-scan address
  CtuPos( int _x, int _y, int _a )
    : ctuPosX  ( _x )
    , ctuPosY  ( _y )
    , ctuRsAddr( _a )
  {
  }
};
627
628
class CtuTsIterator
629
{
630
  private:
631
    const CodingStructure& cs;
632
    const int        m_startTsAddr;
633
    const int        m_endTsAddr;
634
    std::vector<int> m_ctuAddrMap;
635
          int        m_ctuTsAddr;
636
637
  private:
638
    int getNextTsAddr( const int _tsAddr ) const
639
3.46k
    {
640
3.46k
      const PreCalcValues& pcv  = *cs.pcv;
641
3.46k
      const int startSliceRsRow = m_startTsAddr / pcv.widthInCtus;
642
3.46k
      const int startSliceRsCol = m_startTsAddr % pcv.widthInCtus;
643
3.46k
      const int endSliceRsRow   = (m_endTsAddr - 1) / pcv.widthInCtus;
644
3.46k
      const int endSliceRsCol   = (m_endTsAddr - 1) % pcv.widthInCtus;
645
3.46k
            int ctuTsAddr = _tsAddr;
646
3.46k
      CHECK( ctuTsAddr > m_endTsAddr, "error: array index out of bounds" );
647
4.57k
      while( ctuTsAddr < m_endTsAddr )
648
3.46k
      {
649
3.46k
        ctuTsAddr++;
650
3.46k
        const int ctuRsAddr = ctuTsAddr; 
651
3.46k
        if( cs.slice->pps->rectSlice
652
3.46k
            && ( (ctuRsAddr / pcv.widthInCtus) < startSliceRsRow
653
3.46k
              || (ctuRsAddr / pcv.widthInCtus) > endSliceRsRow
654
2.35k
              || (ctuRsAddr % pcv.widthInCtus) < startSliceRsCol
655
2.35k
              || (ctuRsAddr % pcv.widthInCtus) > endSliceRsCol ) )
656
1.11k
          continue;
657
2.35k
        break;
658
3.46k
      }
659
3.46k
      return ctuTsAddr;
660
3.46k
    }
661
662
    int mapAddr( const int _addr ) const
663
3.46k
    {
664
3.46k
      if( _addr < 0 )
665
0
        return _addr;
666
3.46k
      if( _addr >= m_ctuAddrMap.size() )
667
0
        return _addr;
668
3.46k
      return m_ctuAddrMap[ _addr ];
669
3.46k
    }
670
671
  public:
672
1.11k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e,       std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
673
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, bool _wpp                          ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ),                     m_ctuTsAddr( _s ) { if( _wpp ) setWppPattern(); }
674
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
675
1.11k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m, int _c ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( std::max( _s, _c ) ) {}
676
1.11k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>* _m, bool _wpp ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuTsAddr( _s ) {  if( _wpp ) m_ctuAddrMap = *_m;  }
677
678
7.90k
    virtual ~CtuTsIterator() { m_ctuAddrMap.clear(); }
679
680
3.46k
    CtuTsIterator& operator++()                { m_ctuTsAddr = getNextTsAddr( m_ctuTsAddr ); return *this; }
681
0
    CtuTsIterator  operator++(int)             { auto retval = *this; ++(*this); return retval; }
682
0
    bool operator==(CtuTsIterator other) const { return m_ctuTsAddr == other.m_ctuTsAddr; }
683
4.57k
    bool operator!=(CtuTsIterator other) const { return m_ctuTsAddr != other.m_ctuTsAddr; }
684
3.46k
    CtuPos operator*()                   const { const int ctuRsAddr = mapAddr( m_ctuTsAddr );  return CtuPos( ctuRsAddr % cs.pcv->widthInCtus, ctuRsAddr / cs.pcv->widthInCtus, ctuRsAddr ); }
685
686
1.11k
    CtuTsIterator begin() { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap ); };
687
1.11k
    CtuTsIterator end()   { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap, m_endTsAddr ); };
688
689
    using iterator_category = std::forward_iterator_tag;
690
    using value_type        = int;
691
    using pointer           = int*;
692
    using reference         = int&;
693
    using difference_type   = ptrdiff_t;
694
695
    void setWppPattern()
696
0
    {
697
0
      const PreCalcValues& pcv = *cs.pcv;
698
0
      m_ctuAddrMap.resize( pcv.sizeInCtus, 0 );
699
0
      int addr = 0;
700
0
      for( int i = 1; i < pcv.sizeInCtus; i++ )
701
0
      {
702
0
        int x = addr % pcv.widthInCtus;
703
0
        int y = addr / pcv.widthInCtus;
704
0
        x -= 1;
705
0
        y += 1;
706
0
        if( x < 0 || y >= pcv.heightInCtus )
707
0
        {
708
0
          x += 1 + y;
709
0
          y  = 0;
710
0
        }
711
0
        if( x >= pcv.widthInCtus )
712
0
        {
713
0
          y += ( x - pcv.widthInCtus ) + 1;
714
0
          x  = pcv.widthInCtus - 1;
715
0
        }
716
0
        addr = y * pcv.widthInCtus + x;
717
0
        m_ctuAddrMap[ i ] = addr;
718
0
      }
719
0
    }
720
};
721
722
// Forwards to EncSampleAdaptiveOffset::disabledRate() with this slice encoder's
// m_saoDisabledRate storage and the SAO encoding-rate / chroma-format settings from the encoder config.
void EncSlice::saoDisabledRate( CodingStructure& cs, SAOBlkParam* reconParams )
{
  auto& cfg = *m_pcEncCfg;

  EncSampleAdaptiveOffset::disabledRate( cs,
                                         m_saoDisabledRate,
                                         reconParams,
                                         cfg.m_saoEncodingRate,
                                         cfg.m_saoEncodingRateChroma,
                                         cfg.m_internChromaFormat );
}
726
727
// Finalizes a compressed slice. Only acts when SAO is enabled in the SPS and used for this picture:
// optionally stores the SAO disabled statistics (when m_numThreads is zero) and copies the
// luma/chroma SAO enable decisions into every slice of the picture.
void EncSlice::finishCompressSlice( Picture* pic, Slice& slice )
{
  const bool saoActive = slice.sps->saoEnabled && pic->useSAO;
  if( !saoActive )
  {
    return;
  }

  // store disabled statistics
  if( m_pcEncCfg->m_numThreads == 0 )
  {
    saoDisabledRate( *pic->cs, &m_saoReconParams[ 0 ] );
  }

  // set slice header flags; both chroma components must carry the same decision
  CHECK( m_saoEnabled[ COMP_Cb ] != m_saoEnabled[ COMP_Cr ], "Unspecified error");
  for( auto picSlice : pic->slices )
  {
    picSlice->saoEnabled[ CH_L ] = m_saoEnabled[ COMP_Y  ];
    picSlice->saoEnabled[ CH_C ] = m_saoEnabled[ COMP_Cb ];
  }
}
747
748
// Prepares and runs the per-CTU processing for the tile-scan address range
// [startCtuTsAddr, boundingCtuTsAddr) of the given picture.
//
// Steps:
//  1. picture-level initialization (joint CbCr modes, SAO picture decision, ALF init),
//  2. reset of all per-CTU process states to CTU_ENCODE,
//  3. setup of one ctuEncParams entry per CTU, in the order produced by CtuTsIterator,
//  4. execution: with m_numThreads > 0 every entry is handed to the thread pool as a barrier task,
//     otherwise the entries are processed in a loop on the calling thread until the last CTU
//     reaches PROCESS_DONE.
void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr )
{
  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs );
  CodingStructure& cs      = *pic->cs;
  Slice&           slice   = *cs.slice;
  const PreCalcValues& pcv = *cs.pcv;
  const bool useThreadPool = m_pcEncCfg->m_numThreads > 0;

  // initialization
  if( slice.sps->jointCbCr )
  {
    setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() );
  }

  if( slice.sps->saoEnabled && pic->useSAO )
  {
    // check SAO enabled or disabled
    EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat );

    // SAO counts as "all disabled" only if no valid component has it enabled
    m_saoAllDisabled = true;
    for( int compIdx = 0; compIdx < getNumberValidComponents( pcv.chrFormat ); compIdx++ )
    {
      m_saoAllDisabled &= ! m_saoEnabled[ compIdx ];
    }

    std::fill( m_saoReconParams.begin(), m_saoReconParams.end(), SAOBlkParam() );
  }
  else
  {
    m_saoAllDisabled = true;
  }

  if( slice.sps->alfEnabled )
  {
    m_pALF->initEncProcess( slice );
  }

  // every CTU starts in the first stage of the processing state machine
  std::fill( m_processStates.begin(), m_processStates.end(), CTU_ENCODE );

  // fill encoder parameter list
  int idx = 0;
  auto ctuIter = CtuTsIterator( cs, startCtuTsAddr, boundingCtuTsAddr, &m_ctuAddrMap, useThreadPool );
  for( auto ctuPos : ctuIter )
  {
    auto& param = ctuEncParams[ idx ];

    param.pic       = pic;
    param.encSlice  = this;
    param.ctuRsAddr = ctuPos.ctuRsAddr;
    param.ctuPosX   = ctuPos.ctuPosX;
    param.ctuPosY   = ctuPos.ctuPosY;
    param.ctuArea   = UnitArea( pic->chromaFormat, slice.pps->pcv->getCtuArea( ctuPos.ctuPosX, ctuPos.ctuPosY ) );

    // per tile-line resources are only distinguished in multi-threaded mode; otherwise index 0 is used
    if( useThreadPool )
    {
      param.tileLineResIdx = slice.pps->getTileLineId( ctuPos.ctuPosX, ctuPos.ctuPosY );
    }
    else
    {
      param.tileLineResIdx = 0;
    }
    idx++;
  }

  //for( int i = 0; i < idx; i++ )
  //{
  //  for( int j = i; j < idx; j++ )
  //  {
  //    if( ctuEncParams[i].tileLineResIdx != ctuEncParams[j].tileLineResIdx ) continue;
  //
  //    CHECK( ctuEncParams[i].ctuPosY != ctuEncParams[j].ctuPosY, "Not the same CTU line!" );
  //    CHECK( slice.pps->getTileIdx( ctuEncParams[i].ctuPosX, ctuEncParams[i].ctuPosY ) != slice.pps->getTileIdx( ctuEncParams[j].ctuPosX, ctuEncParams[j].ctuPosY ), "Not the same tile!" );
  //  }
  //}

  // the iterated range is expected to cover exactly all CTUs of the picture
  CHECK( idx != pcv.sizeInCtus, "array index out of bounds" );

  // process ctu's until last ctu is done
  if( useThreadPool )
  {
    // one barrier task per CTU; the <true> instantiation only reports readiness and does no work
    // NOTE(review): presumably the pool uses it as the task's ready-check - confirm against the thread pool API
    for( auto& ctuEncParam : ctuEncParams )
    {
      m_threadPool->addBarrierTask( EncSlice::xProcessCtuTask<false>,
                                    &ctuEncParam,
                                    m_ctuTasksDoneCounter,
                                    nullptr,
                                    {},
                                    EncSlice::xProcessCtuTask<true> );
    }
  }
  else
  {
    // single-threaded: repeatedly sweep over all CTUs, advancing each unfinished one,
    // until the last CTU of the range has reached PROCESS_DONE
    do
    {
      for( auto& ctuEncParam : ctuEncParams )
      {
        if( m_processStates[ctuEncParam.ctuRsAddr] != PROCESS_DONE )
          EncSlice::xProcessCtuTask<false>( 0, &ctuEncParam );
      }
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_LUMA_LF,   cs, cs.getRecoBuf(), COMP_Y  );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cb );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cr );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_LUMA_SAO,   cs, cs.getRecoBuf(), COMP_Y  );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cb );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cr );
    }
    while( m_processStates[ boundingCtuTsAddr - 1 ] != PROCESS_DONE );
  }
}
855
856
inline bool checkCtuTaskNbTop( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
857
676k
{
858
676k
  return ctuPosY > 0 && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus ] <= tskType;
859
676k
}
860
861
inline bool checkCtuTaskNbBot( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
862
282k
{
863
282k
  return ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, 1 ) ) && processStates[ ctuRsAddr     + pps.pcv->widthInCtus ] <= tskType;
864
282k
}
865
866
inline bool checkCtuTaskNbRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
867
551k
{
868
551k
  return ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, 0 ) ) && processStates[ ctuRsAddr + 1 ] <= tskType;
869
551k
}
870
871
inline bool checkCtuTaskNbTopRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
872
227k
{
873
227k
  return ctuPosY > 0 && ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus + 1 ] <= tskType;
874
227k
}
875
876
inline bool checkCtuTaskNbBotRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, const int rightOffset = 1, bool override = false )
877
5.77M
{
878
5.77M
  return ctuPosX + rightOffset < pps.pcv->widthInCtus && ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, rightOffset, 1 ) ) && processStates[ ctuRsAddr + rightOffset + pps.pcv->widthInCtus ] <= tskType;
879
5.77M
}
880
881
template<bool checkReadyState>
882
bool EncSlice::xProcessCtuTask( int threadIdx, void* taskParam )
883
88.5M
{
884
88.5M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
88.5M
  Picture* pic                   = ctuEncParam->pic;
886
88.5M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
88.5M
  CodingStructure& cs            = *pic->cs;
888
88.5M
  Slice&           slice         = *cs.slice;
889
88.5M
  const PPS&       pps           = *slice.pps;
890
88.5M
  const PreCalcValues& pcv       = *cs.pcv;
891
88.5M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
88.5M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
88.5M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
88.5M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
88.5M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
88.5M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
88.5M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
88.5M
  const int ctuStride            = pcv.widthInCtus;
899
88.5M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
88.5M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
88.5M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
88.5M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
88.5M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
88.5M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
88.5M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
88.5M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
88.5M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
49.3M
    return false;
916
917
39.2M
  switch( currState )
918
39.2M
  {
919
    // encode
920
20.0M
    case CTU_ENCODE:
921
20.0M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
20.0M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
20.0M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
20.0M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
16.7M
          return false;
937
3.30M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
3.29M
          return false;
939
        
940
6.89k
        if( checkReadyState )
941
3.45k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
3.43k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
3.43k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
3.43k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
3.43k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
3.43k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
3.43k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
3.46k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
1.95k
        {
960
1.95k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
1.95k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
1.95k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
1.95k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
1.95k
        }
965
966
3.43k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
3.43k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
3.43k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
3.43k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
11.0M
    case RESHAPE_LF_VER:
975
11.0M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
11.0M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
11.0M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
11.0M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
11.0M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles
984
11.0M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
16.8M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
11.0M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
5.31M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
5.76M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
5.76M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
5.75M
          return false;
1005
1006
6.48k
        if( checkReadyState )
1007
3.44k
          return true;
1008
1009
3.04k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
3.04k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
3.04k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
3.46k
        {
1023
3.46k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
3.46k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
3.46k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
3.46k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
3.46k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
3.46k
        }
1032
1033
3.04k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
3.04k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
3.04k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
370k
    case LF_HOR:
1041
370k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
370k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
107k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
262k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
256k
          return false;
1051
1052
6.63k
        if( checkReadyState )
1053
3.45k
          return true;
1054
1055
3.18k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
3.18k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
3.44k
        {
1059
3.44k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
3.44k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
3.44k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
3.44k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
3.44k
        }
1064
1065
3.18k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
3.18k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
3.18k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
301k
    case SAO_FILTER:
1073
301k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
301k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
226k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing dependents on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
199k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
189k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
9.89k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
6.81k
        if( checkReadyState )
1086
3.46k
          return true;
1087
1088
3.35k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
3.45k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
3.45k
        {
1093
3.45k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
3.45k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
3.45k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
3.45k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
3.45k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
3.45k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
3.45k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
3.45k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
3.45k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
3.45k
        }
1104
1105
        // ALF border extension
1106
3.35k
        if( cs.sps->alfEnabled )
1107
3.46k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
3.46k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
3.46k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
3.46k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
3.46k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
3.46k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
3.46k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
3.46k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
3.46k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
3.46k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
3.46k
        }
1122
1123
        // DMVR refinement can be stored now
1124
3.46k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
3.46k
        {
1126
3.46k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
3.46k
        }
1128
3.35k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
3.35k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
3.35k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
3.35k
        if( ctuPosX == lastCtuColInTileRow )
1133
1.95k
        {
1134
1.95k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
1.95k
        }
1136
1.40k
        else
1137
1.40k
        {
1138
1.40k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.40k
          return true;
1140
1.40k
        }
1141
3.35k
      }
1142
1.95k
      break;
1143
1144
89.1k
    case ALF_GET_STATISTICS:
1145
89.1k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
89.1k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
89.1k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
3.90k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
3.90k
        if( checkReadyState )
1153
1.95k
          return true;
1154
1155
1.94k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1.94k
        if( slice.sps->alfEnabled )
1159
1.94k
        {
1160
1.94k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
1.94k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
1.94k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
5.40k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
3.46k
          {
1165
3.46k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
3.46k
          }
1167
1.94k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
1.94k
        }
1169
1170
1.94k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1.94k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1.94k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1.94k
      }
1176
0
      break;
1177
1178
815k
    case ALF_DERIVE_FILTER:
1179
815k
      {
1180
815k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
815k
        if( ctuRsAddr == deriveFilterCtu )
1182
815k
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
815k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
824k
          for( int y = 0; y < numCheckLines; y++ )
1186
821k
          {
1187
830k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
821k
            {
1189
821k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
821k
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
813k
                return false;
1192
821k
            }
1193
821k
          }
1194
815k
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the sub-sequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
2.22k
        if( checkReadyState )
1202
1.11k
          return true;
1203
1204
1.11k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.11k
        if( slice.sps->alfEnabled )
1207
1.11k
        {
1208
1.11k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.11k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.11k
          {
1211
1.11k
            encSlice->m_pALF->initDerivation( slice );
1212
1.11k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.11k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.11k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.11k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.11k
        }
1228
1229
1.11k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.11k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.11k
      }
1232
0
      break;
1233
1234
6.38M
    case ALF_RECONSTRUCT:
1235
6.38M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
6.38M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
6.38M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
6.38M
          return false;
1240
3.89k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
3.89k
        if( checkReadyState )
1249
1.94k
          return true;
1250
1251
1.95k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
1.95k
        if( slice.sps->alfEnabled )
1254
1.95k
        {
1255
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
1.95k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
5.41k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
3.46k
          {
1259
3.46k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
3.46k
          }
1261
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
1.95k
        }
1263
1264
1.95k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
1.95k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
1.95k
      }
1267
      // don't break: no additional dependencies, can continue straight away!
1268
      //break;
1269
1270
5.37k
    case CCALF_GET_STATISTICS:
1271
5.37k
      {
1272
5.37k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
3.64k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
2.67k
        if( checkReadyState )
1276
727
          return true;
1277
1278
1.95k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
1.95k
        if( slice.sps->ccalfEnabled )
1282
1.95k
        {
1283
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
1.95k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
5.41k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
3.46k
          {
1287
3.46k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
3.46k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
3.46k
          }
1290
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
1.95k
        }
1292
1293
1.95k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
1.95k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
1.95k
      }
1298
0
      break;
1299
1300
81.5k
    case CCALF_DERIVE_FILTER:
1301
81.5k
      {
1302
        // synchronization dependencies
1303
81.5k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
81.5k
        if( ctuRsAddr == deriveFilterCtu )
1305
81.5k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
81.5k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
87.1k
          for( int y = 0; y < numCheckLines; y++ )
1309
84.9k
          {
1310
90.6k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
84.9k
            {
1312
84.9k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
84.9k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
79.3k
                return false;
1315
84.9k
            }
1316
84.9k
          }
1317
81.5k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
2.22k
        if( checkReadyState )
1325
1.11k
          return true;
1326
1327
1.11k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.11k
        if( slice.sps->ccalfEnabled )
1331
1.11k
        {
1332
1.11k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.11k
          {
1334
1.11k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.11k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.11k
        }
1346
1.11k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.11k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.11k
      }
1350
0
      break;
1351
1352
8.74k
    case CCALF_RECONSTRUCT:
1353
8.74k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
8.74k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
8.74k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
4.84k
          return false;
1358
1359
3.90k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
3.90k
        if( checkReadyState )
1368
1.95k
          return true;
1369
1370
1.94k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
1.94k
        if( slice.sps->ccalfEnabled )
1373
1.94k
        {
1374
1.94k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
5.40k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
3.46k
          {
1377
3.46k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
3.46k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
3.46k
          }
1380
1.94k
        }
1381
1382
1.94k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
1.94k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
1.94k
        {
1388
1.94k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
1.94k
          const int margin = cs.picture->margin;
1390
1.94k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
1.94k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
1.94k
          if(ctuPosY == 0)
1393
1.11k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
1.94k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.11k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
1.94k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
1.94k
        }
1401
1402
        // perform finish only once for whole picture
1403
1.94k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
1.94k
        if( ctuRsAddr < finishCtu )
1405
843
        {
1406
843
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
843
          return true;
1409
843
        }
1410
1.10k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.10k
      }
1412
1413
14.7k
    case FINISH_SLICE:
1414
14.7k
      {
1415
14.7k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
31.1k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
29.5k
          if( processStates[ i ] < FINISH_SLICE )
1420
13.0k
            return false;
1421
1422
1.64k
        if( checkReadyState )
1423
540
          return true;
1424
1425
1.10k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.10k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.10k
        return true;
1430
1.64k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
39.2M
  }
1440
1441
18.4k
  return false;
1442
39.2M
}
bool vvenc::EncSlice::xProcessCtuTask<false>(int, void*)
Line
Count
Source
883
23.1k
{
884
23.1k
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
23.1k
  Picture* pic                   = ctuEncParam->pic;
886
23.1k
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
23.1k
  CodingStructure& cs            = *pic->cs;
888
23.1k
  Slice&           slice         = *cs.slice;
889
23.1k
  const PPS&       pps           = *slice.pps;
890
23.1k
  const PreCalcValues& pcv       = *cs.pcv;
891
23.1k
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
23.1k
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
23.1k
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
23.1k
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
23.1k
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
23.1k
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
23.1k
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
23.1k
  const int ctuStride            = pcv.widthInCtus;
899
23.1k
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
23.1k
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
23.1k
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
23.1k
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
23.1k
  const TaskType currState       = processStates[ ctuRsAddr ];
904
23.1k
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
23.1k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
23.1k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
23.1k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
23.1k
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
23.1k
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
23.1k
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
0
    return false;
916
917
23.1k
  switch( currState )
918
23.1k
  {
919
    // encode
920
3.46k
    case CTU_ENCODE:
921
3.46k
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
3.46k
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
3.46k
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
3.46k
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
0
          return false;
937
3.46k
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
0
          return false;
939
        
940
3.46k
        if( checkReadyState )
941
0
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
3.46k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
3.46k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
3.46k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
3.46k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
3.46k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
3.46k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
3.46k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
1.95k
        {
960
1.95k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
1.95k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
1.95k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
1.95k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
1.95k
        }
965
966
3.46k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
3.46k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
3.46k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
3.46k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
3.46k
    case RESHAPE_LF_VER:
975
3.46k
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
3.46k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
3.46k
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
3.46k
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
3.46k
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above because of tiling, which allows CTU_ENCODE to run independently across tiles
984
3.46k
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
6.92k
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
3.46k
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
0
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
3.46k
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
3.46k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
0
          return false;
1005
1006
3.46k
        if( checkReadyState )
1007
0
          return true;
1008
1009
3.46k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
3.46k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
3.46k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
3.46k
        {
1023
3.46k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
3.46k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
3.46k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
3.46k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
3.46k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
3.46k
        }
1032
1033
3.46k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
3.46k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
3.46k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
3.44k
    case LF_HOR:
1041
3.44k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
3.44k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
0
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
3.44k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
0
          return false;
1051
1052
3.44k
        if( checkReadyState )
1053
0
          return true;
1054
1055
3.44k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
3.44k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
3.44k
        {
1059
3.44k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
3.44k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
3.44k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
3.44k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
3.44k
        }
1064
1065
3.44k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
3.44k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
3.44k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
3.45k
    case SAO_FILTER:
1073
3.45k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
3.45k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
3.45k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // SAO processing depends on +1 pixel on each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
3.45k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
3.45k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
3.45k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
3.45k
        if( checkReadyState )
1086
0
          return true;
1087
1088
3.45k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
3.45k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
3.45k
        {
1093
3.45k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
3.45k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
3.45k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
3.45k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
3.45k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
3.45k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
3.45k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
3.45k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
3.45k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
3.45k
        }
1104
1105
        // ALF border extension
1106
3.45k
        if( cs.sps->alfEnabled )
1107
3.46k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
3.46k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
3.46k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
3.46k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
3.46k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
3.46k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
3.46k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
3.46k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
3.46k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
3.46k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
3.46k
        }
1122
1123
        // DMVR refinement can be stored now
1124
3.46k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
3.46k
        {
1126
3.46k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
3.46k
        }
1128
3.45k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
3.45k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
3.45k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
3.45k
        if( ctuPosX == lastCtuColInTileRow )
1133
1.95k
        {
1134
1.95k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
1.95k
        }
1136
1.50k
        else
1137
1.50k
        {
1138
1.50k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.50k
          return true;
1140
1.50k
        }
1141
3.45k
      }
1142
1.95k
      break;
1143
1144
1.95k
    case ALF_GET_STATISTICS:
1145
1.94k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
1.94k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
1.94k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
1.94k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
1.94k
        if( checkReadyState )
1153
0
          return true;
1154
1155
1.94k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1.94k
        if( slice.sps->alfEnabled )
1159
1.94k
        {
1160
1.94k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
1.94k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
1.94k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
5.40k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
3.46k
          {
1165
3.46k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
3.46k
          }
1167
1.94k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
1.94k
        }
1169
1170
1.94k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1.94k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1.94k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1.94k
      }
1176
0
      break;
1177
1178
1.11k
    case ALF_DERIVE_FILTER:
1179
1.11k
      {
1180
1.11k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.11k
        if( ctuRsAddr == deriveFilterCtu )
1182
1.11k
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.11k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
3.06k
          for( int y = 0; y < numCheckLines; y++ )
1186
1.95k
          {
1187
3.90k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
1.95k
            {
1189
1.95k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
1.95k
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
0
                return false;
1192
1.95k
            }
1193
1.95k
          }
1194
1.11k
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the subsequent CTU-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.11k
        if( checkReadyState )
1202
0
          return true;
1203
1204
1.11k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.11k
        if( slice.sps->alfEnabled )
1207
1.11k
        {
1208
1.11k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.11k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.11k
          {
1211
1.11k
            encSlice->m_pALF->initDerivation( slice );
1212
1.11k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.11k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.11k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.11k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.11k
        }
1228
1229
1.11k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.11k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.11k
      }
1232
0
      break;
1233
1234
1.95k
    case ALF_RECONSTRUCT:
1235
1.95k
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
1.95k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
1.95k
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
0
          return false;
1240
1.95k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
1.95k
        if( checkReadyState )
1249
0
          return true;
1250
1251
1.95k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
1.95k
        if( slice.sps->alfEnabled )
1254
1.95k
        {
1255
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
1.95k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
5.41k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
3.46k
          {
1259
3.46k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
3.46k
          }
1261
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
1.95k
        }
1263
1264
1.95k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
1.95k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
1.95k
      }
1267
      // don't break: no additional dependencies, can continue straight away!
1268
      //break;
1269
1270
2.68k
    case CCALF_GET_STATISTICS:
1271
2.68k
      {
1272
2.68k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
2.18k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
1.95k
        if( checkReadyState )
1276
0
          return true;
1277
1278
1.95k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
1.95k
        if( slice.sps->ccalfEnabled )
1282
1.95k
        {
1283
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
1.95k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
5.41k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
3.46k
          {
1287
3.46k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
3.46k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
3.46k
          }
1290
1.95k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
1.95k
        }
1292
1293
1.95k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
1.95k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
1.95k
      }
1298
0
      break;
1299
1300
1.11k
    case CCALF_DERIVE_FILTER:
1301
1.11k
      {
1302
        // synchronization dependencies
1303
1.11k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
1.11k
        if( ctuRsAddr == deriveFilterCtu )
1305
1.11k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
1.11k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
3.06k
          for( int y = 0; y < numCheckLines; y++ )
1309
1.95k
          {
1310
3.90k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
1.95k
            {
1312
1.95k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
1.95k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
0
                return false;
1315
1.95k
            }
1316
1.95k
          }
1317
1.11k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.11k
        if( checkReadyState )
1325
0
          return true;
1326
1327
1.11k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.11k
        if( slice.sps->ccalfEnabled )
1331
1.11k
        {
1332
1.11k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.11k
          {
1334
1.11k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.11k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.11k
        }
1346
1.11k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.11k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.11k
      }
1350
0
      break;
1351
1352
1.94k
    case CCALF_RECONSTRUCT:
1353
1.94k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
1.94k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
1.94k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
0
          return false;
1358
1359
1.94k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
1.94k
        if( checkReadyState )
1368
0
          return true;
1369
1370
1.94k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
1.94k
        if( slice.sps->ccalfEnabled )
1373
1.94k
        {
1374
1.94k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
5.40k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
3.46k
          {
1377
3.46k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
3.46k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
3.46k
          }
1380
1.94k
        }
1381
1382
1.94k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
1.94k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
1.94k
        {
1388
1.94k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
1.94k
          const int margin = cs.picture->margin;
1390
1.94k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
1.94k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
1.94k
          if(ctuPosY == 0)
1393
1.11k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
1.94k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.11k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
1.94k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
1.94k
        }
1401
1402
        // perform finish only once for whole picture
1403
1.94k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
1.94k
        if( ctuRsAddr < finishCtu )
1405
843
        {
1406
843
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
843
          return true;
1409
843
        }
1410
1.10k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.10k
      }
1412
1413
1.64k
    case FINISH_SLICE:
1414
1.64k
      {
1415
1.64k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
4.46k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
3.35k
          if( processStates[ i ] < FINISH_SLICE )
1420
540
            return false;
1421
1422
1.10k
        if( checkReadyState )
1423
0
          return true;
1424
1425
1.10k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.10k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.10k
        return true;
1430
1.10k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
23.1k
  }
1440
1441
18.4k
  return false;
1442
23.1k
}
bool vvenc::EncSlice::xProcessCtuTask<true>(int, void*)
Line
Count
Source
883
88.5M
{
884
88.5M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
88.5M
  Picture* pic                   = ctuEncParam->pic;
886
88.5M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
88.5M
  CodingStructure& cs            = *pic->cs;
888
88.5M
  Slice&           slice         = *cs.slice;
889
88.5M
  const PPS&       pps           = *slice.pps;
890
88.5M
  const PreCalcValues& pcv       = *cs.pcv;
891
88.5M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
88.5M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
88.5M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
88.5M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
88.5M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
88.5M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
88.5M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
88.5M
  const int ctuStride            = pcv.widthInCtus;
899
88.5M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
88.5M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
88.5M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
88.5M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
88.5M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
88.5M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
88.5M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
88.5M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
88.5M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
88.5M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
49.3M
    return false;
916
917
39.1M
  switch( currState )
918
39.1M
  {
919
    // encode
920
20.0M
    case CTU_ENCODE:
921
20.0M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
20.0M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
20.0M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
20.0M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
16.7M
          return false;
937
3.30M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
3.29M
          return false;
939
        
940
3.42k
        if( checkReadyState )
941
3.45k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
18.4E
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
18.4E
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
18.4E
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
18.4E
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
18.4E
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
18.4E
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
0
        {
960
0
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
0
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
0
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
0
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
0
        }
965
966
18.4E
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
18.4E
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
18.4E
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
11.0M
    case RESHAPE_LF_VER:
975
11.0M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
11.0M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
11.0M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
11.0M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
11.0M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above because of tiling, which allows CTU_ENCODE to run independently across tiles
984
11.0M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
16.8M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
11.0M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
5.31M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
5.75M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
5.75M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
5.75M
          return false;
1005
1006
3.01k
        if( checkReadyState )
1007
3.44k
          return true;
1008
1009
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
18.4E
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
0
        {
1023
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
0
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
0
        }
1032
1033
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
18.4E
        processStates[ ctuRsAddr ] = LF_HOR;
1036
18.4E
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
367k
    case LF_HOR:
1041
367k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
367k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
107k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
259k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
256k
          return false;
1051
1052
3.18k
        if( checkReadyState )
1053
3.45k
          return true;
1054
1055
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
0
        {
1059
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
0
        }
1064
1065
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
18.4E
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
18.4E
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
297k
    case SAO_FILTER:
1073
297k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
297k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
223k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing depends on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
196k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
186k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
6.43k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
3.36k
        if( checkReadyState )
1086
3.46k
          return true;
1087
1088
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
18.4E
        if( slice.sps->saoEnabled && pic->useSAO )
1092
0
        {
1093
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
0
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
0
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
0
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
0
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
0
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
0
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
0
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
0
        }
1104
1105
        // ALF border extension
1106
18.4E
        if( cs.sps->alfEnabled )
1107
0
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
0
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
0
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
0
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
0
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
0
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
0
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
0
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
0
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
0
        }
1122
1123
        // DMVR refinement can be stored now
1124
18.4E
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
0
        {
1126
0
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
0
        }
1128
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
18.4E
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
18.4E
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
18.4E
        if( ctuPosX == lastCtuColInTileRow )
1133
0
        {
1134
0
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
0
        }
1136
18.4E
        else
1137
18.4E
        {
1138
18.4E
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
18.4E
          return true;
1140
18.4E
        }
1141
18.4E
      }
1142
0
      break;
1143
1144
87.2k
    case ALF_GET_STATISTICS:
1145
87.2k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
87.2k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
87.2k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
1.95k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
1.95k
        if( checkReadyState )
1153
1.95k
          return true;
1154
1155
1
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1
        if( slice.sps->alfEnabled )
1159
0
        {
1160
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
0
          {
1165
0
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
0
          }
1167
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
0
        }
1169
1170
1
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1
      }
1176
0
      break;
1177
1178
814k
    case ALF_DERIVE_FILTER:
1179
814k
      {
1180
814k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
814k
        if( ctuRsAddr == deriveFilterCtu )
1182
814k
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
814k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
820k
          for( int y = 0; y < numCheckLines; y++ )
1186
819k
          {
1187
826k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
819k
            {
1189
819k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
819k
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
813k
                return false;
1192
819k
            }
1193
819k
          }
1194
814k
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the subsequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.11k
        if( checkReadyState )
1202
1.11k
          return true;
1203
1204
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
0
        if( slice.sps->alfEnabled )
1207
0
        {
1208
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
0
          if( ctuRsAddr == deriveFilterCtu )
1210
0
          {
1211
0
            encSlice->m_pALF->initDerivation( slice );
1212
0
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
0
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
0
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
0
        }
1228
1229
0
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
0
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
0
      }
1232
0
      break;
1233
1234
6.38M
    case ALF_RECONSTRUCT:
1235
6.38M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
6.38M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
6.38M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
6.38M
          return false;
1240
1.94k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
1.94k
        if( checkReadyState )
1249
1.94k
          return true;
1250
1251
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
18.4E
        if( slice.sps->alfEnabled )
1254
0
        {
1255
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
0
          {
1259
0
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
0
          }
1261
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
0
        }
1263
1264
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
18.4E
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
18.4E
      }
1267
      // don't break, no additional deps, can continue straight away!
1268
      //break;
1269
1270
2.69k
    case CCALF_GET_STATISTICS:
1271
2.69k
      {
1272
2.69k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
1.45k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
724
        if( checkReadyState )
1276
727
          return true;
1277
1278
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
18.4E
        if( slice.sps->ccalfEnabled )
1282
0
        {
1283
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
0
          {
1287
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
0
          }
1290
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
0
        }
1292
1293
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start ccalf filter derivation either for a subset of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
18.4E
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
18.4E
      }
1298
0
      break;
1299
1300
80.4k
    case CCALF_DERIVE_FILTER:
1301
80.4k
      {
1302
        // synchronization dependencies
1303
80.4k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
80.4k
        if( ctuRsAddr == deriveFilterCtu )
1305
80.4k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
80.4k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
84.1k
          for( int y = 0; y < numCheckLines; y++ )
1309
83.0k
          {
1310
86.7k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
83.0k
            {
1312
83.0k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
83.0k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
79.3k
                return false;
1315
83.0k
            }
1316
83.0k
          }
1317
80.4k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.11k
        if( checkReadyState )
1325
1.11k
          return true;
1326
1327
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
0
        if( slice.sps->ccalfEnabled )
1331
0
        {
1332
0
          if( ctuRsAddr == deriveFilterCtu )
1333
0
          {
1334
0
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
0
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
0
        }
1346
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
0
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
0
      }
1350
0
      break;
1351
1352
6.79k
    case CCALF_RECONSTRUCT:
1353
6.79k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
6.79k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
6.79k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
4.84k
          return false;
1358
1359
1.95k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
1.95k
        if( checkReadyState )
1368
1.95k
          return true;
1369
1370
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
0
        if( slice.sps->ccalfEnabled )
1373
0
        {
1374
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
0
          {
1377
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
0
          }
1380
0
        }
1381
1382
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
0
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
0
        {
1388
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
0
          const int margin = cs.picture->margin;
1390
0
          recoBuf.extendBorderPelLft( y, height, margin );
1391
0
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
0
          if(ctuPosY == 0)
1393
0
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
0
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
0
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
0
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
0
        }
1401
1402
        // perform finish only once for whole picture
1403
0
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
0
        if( ctuRsAddr < finishCtu )
1405
0
        {
1406
0
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
0
          return true;
1409
0
        }
1410
0
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
0
      }
1412
1413
13.0k
    case FINISH_SLICE:
1414
13.0k
      {
1415
13.0k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
26.6k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
26.1k
          if( processStates[ i ] < FINISH_SLICE )
1420
12.5k
            return false;
1421
1422
540
        if( checkReadyState )
1423
540
          return true;
1424
1425
0
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
0
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
0
        return true;
1430
540
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
39.1M
  }
1440
1441
0
  return false;
1442
39.1M
}
1443
1444
// Writes the CTU data of the picture's slice (pic->cs->slice) with the CABAC writer.
//
// Steps performed:
//  - select the CABAC init table for the slice and initialise the context models,
//  - code every CTU from the slice's start address up to cs.pcv->sizeInCtus, each into
//    the currently active sub-stream; the contexts are re-initialised at the first CTU of
//    a tile and, with entropy coding sync enabled, at the first CTU of every CTU row of a tile,
//  - terminate the active sub-stream at the end of a tile, of a wavefront CTU row and of the
//    slice; the size of every sub-stream except the last one is recorded in the slice,
//  - update m_encCABACTableIdx, which is read again at the top of this function,
//  - append the sub-streams to pic->sliceDataStreams[0] and add the writer's bin count to
//    pic->sliceDataNumBins.
//
// \param pic  picture whose coding structure provides the slice, parameter sets and CTU layout
//
// NOTE(review): startCtuTsAddr is read as a *value* from ctuAddrInSlice[0] but is then used as
// an *index* into ctuAddrInSlice; presumably this relies on the slice starting at CTU 0 - confirm.
void EncSlice::encodeSliceData( Picture* pic )
1445
1.11k
{
1446
1.11k
  CodingStructure& cs              = *pic->cs;
1447
1.11k
  Slice* const slice               = cs.slice;
1448
1.11k
  const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
1449
1.11k
  const uint32_t boundingCtuTsAddr = cs.pcv->sizeInCtus;
1450
1.11k
  const bool wavefrontsEnabled     = slice->sps->entropyCodingSyncEnabled;
1451
1452
  // this ensures that independently encoded bitstream chunks can be combined to a bit-equal stream
1453
1.11k
  const SliceType cabacTableIdx = ! slice->pps->cabacInitPresent || slice->pendingRasInit ? slice->sliceType : m_encCABACTableIdx;
1454
1.11k
  slice->encCABACTableIdx = cabacTableIdx;
1455
1456
  // initialise entropy coder for the slice
1457
1.11k
  m_CABACWriter.initCtxModels( *slice );
1458
1459
1.11k
  DTRACE( g_trace_ctx, D_HEADER, "=========== POC: %d ===========\n", slice->poc );
1460
1461
1.11k
  int prevQP[MAX_NUM_CH];
1462
1.11k
  prevQP[0] = prevQP[1] = slice->sliceQp;
1463
1464
1.11k
  const PreCalcValues& pcv        = *cs.pcv;
1465
1.11k
  const uint32_t widthInCtus      = pcv.widthInCtus;
1466
1.11k
  uint32_t uiSubStrm              = 0;
1467
1.11k
  const int numSubstreamsColumns  = slice->pps->numTileCols;
1468
1.11k
  const int numSubstreamRows      = slice->sps->entropyCodingSyncEnabled ? pic->cs->pcv->heightInCtus : slice->pps->numTileRows;
1469
1.11k
  const int numSubstreams         = std::max<int>( numSubstreamRows * numSubstreamsColumns, 0/*(int)pic->brickMap->bricks.size()*/ );
1470
1.11k
  std::vector<OutputBitstream> substreamsOut( numSubstreams );
1471
1472
1.11k
  slice->clearSubstreamSizes();
1473
1474
4.57k
  for( uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++ )
1475
3.46k
  {
1476
3.46k
    const uint32_t ctuRsAddr            = slice->sliceMap.ctuAddrInSlice[ctuTsAddr];
1477
3.46k
    const uint32_t ctuXPosInCtus        = ctuRsAddr % widthInCtus;
1478
3.46k
    const uint32_t ctuYPosInCtus        = ctuRsAddr / widthInCtus;
1479
3.46k
    const uint32_t tileXPosInCtus       = slice->pps->tileColBd[cs.pps->ctuToTileCol[ctuXPosInCtus]];
1480
3.46k
    const uint32_t tileYPosInCtus       = slice->pps->tileRowBd[cs.pps->ctuToTileRow[ctuYPosInCtus]];
1481
1482
3.46k
    DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
1483
1484
3.46k
    const Position pos (ctuXPosInCtus * pcv.maxCUSize, ctuYPosInCtus * pcv.maxCUSize);
1485
3.46k
    const UnitArea ctuArea (cs.area.chromaFormat, Area(pos.x, pos.y, pcv.maxCUSize, pcv.maxCUSize));
1486
3.46k
    CHECK( uiSubStrm >= numSubstreams, "array index out of bounds" );
1487
3.46k
    m_CABACWriter.initBitstream( &substreamsOut[ uiSubStrm ] );
1488
1489
    // set up CABAC contexts' state for this CTU
1490
3.46k
    if (ctuXPosInCtus == tileXPosInCtus && ctuYPosInCtus == tileYPosInCtus )
1491
1.11k
    {
1492
1.11k
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1493
0
      {
1494
0
        m_CABACWriter.initCtxModels( *slice );
1495
0
      }
1496
1.11k
      prevQP[0] = prevQP[1] = slice->sliceQp;
1497
1.11k
    }
1498
2.35k
    else if (ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled)
1499
0
    {
1500
      // At the start of a CTU line within the tile: synchronize CABAC probabilities with the CTU directly above, if it is available.
1501
0
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1502
0
      {
1503
0
        m_CABACWriter.initCtxModels( *slice );
1504
0
      }
1505
0
      if( cs.getCURestricted( pos.offset( 0, -1 ), pos, slice->independentSliceIdx, slice->pps->getTileIdx( ctuXPosInCtus, ctuYPosInCtus ), CH_L, TREE_D ) )
1506
0
      {
1507
        // The position above is available, so take over the stored sync context state.
1508
0
        m_CABACWriter.getCtx() = m_entropyCodingSyncContextState;
1509
0
      }
1510
0
      prevQP[0] = prevQP[1] = slice->sliceQp;
1511
0
    }
1512
1513
3.46k
    m_CABACWriter.coding_tree_unit( cs, ctuArea, prevQP, ctuRsAddr );
1514
1515
    // store probabilities after coding the first CTU of a line (within the tile) into the sync buffer
1516
3.46k
    if( ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled )
1517
0
    {
1518
0
      m_entropyCodingSyncContextState = m_CABACWriter.getCtx();
1519
0
    }
1520
1521
    // terminate the sub-stream, if required (end of slice-segment, end of tile, end of wavefront-CTU-row):
1522
3.46k
    bool isMoreCTUsinSlice = ctuTsAddr != (boundingCtuTsAddr - 1);
1523
3.46k
    bool isLastCTUinTile   = isMoreCTUsinSlice && slice->pps->getTileIdx( ctuRsAddr ) != slice->pps->getTileIdx( slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] );
1524
3.46k
    bool isLastCTUinWPP    = wavefrontsEnabled && isMoreCTUsinSlice && !isLastCTUinTile && ( (slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus) == cs.pps->tileColBd[cs.pps->ctuToTileCol[slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus]] ); //TODO: adjust tile bound condition
1525
1526
3.46k
    if (isLastCTUinWPP || !isMoreCTUsinSlice || isLastCTUinTile )         // this is the last CTU of either tile/brick/WPP/slice
1527
1.11k
    {
1528
1.11k
      m_CABACWriter.end_of_slice();
1529
1530
      // Byte-alignment in slice_data() when new tile
1531
1.11k
      substreamsOut[ uiSubStrm ].writeByteAlignment();
1532
1533
1.11k
      if (isMoreCTUsinSlice) // the sub-stream size is only recorded when it is not the last substream in the slice
1534
0
      {
1535
        // write sub-stream size
1536
0
        slice->addSubstreamSize( ( substreamsOut[ uiSubStrm ].getNumberOfWrittenBits() >> 3 ) + substreamsOut[ uiSubStrm ].countStartCodeEmulations() );
1537
0
      }
1538
1.11k
      uiSubStrm++;
1539
1.11k
    }
1540
3.46k
  } // CTU-loop
1541
1542
1.11k
  if(slice->pps->cabacInitPresent)
1543
0
  {
1544
0
    m_encCABACTableIdx = m_CABACWriter.getCtxInitId( *slice );
1545
0
  }
1546
1.11k
  else
1547
1.11k
  {
1548
1.11k
    m_encCABACTableIdx = slice->sliceType;
1549
1.11k
  }
1550
1551
  // concatenate substreams (one more than the number of recorded sub-stream sizes, as the last one has no size entry)
1552
1.11k
  OutputBitstream& outStream = pic->sliceDataStreams[ 0/*slice->sliceIdx*/ ];
1553
2.22k
  for ( int i = 0; i < slice->getNumberOfSubstreamSizes() + 1; i++ )
1554
1.11k
  {
1555
1.11k
    outStream.addSubstream( &(substreamsOut[ i ]) );
1556
1.11k
  }
1557
1.11k
  pic->sliceDataNumBins += m_CABACWriter.getNumBins();
1558
1.11k
}
1559
1560
} // namespace vvenc
1561
1562
//! \}
1563