Coverage Report

Created: 2026-05-30 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/EncoderLib/EncSlice.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     EncSlice.cpp
45
    \brief    slice encoder class
46
*/
47
48
#include "EncSlice.h"
49
#include "EncStage.h"
50
#include "EncLib.h"
51
#include "EncPicture.h"
52
#include "BitAllocation.h"
53
#include "CommonLib/UnitTools.h"
54
#include "CommonLib/Picture.h"
55
#include "CommonLib/TimeProfiler.h"
56
#include "CommonLib/dtrace_codingstruct.h"
57
#include "Utilities/NoMallocThreadPool.h"
58
59
#include <math.h>
60
#include "vvenc/vvencCfg.h"
61
62
//! \ingroup EncoderLib
63
//! \{
64
65
namespace vvenc {
66
67
// Instrumentation handles, compiled only when TRACE_ENABLE_ITT is defined: one domain named
// "Encode" plus one string handle per stage label. The code that uses these handles is not
// part of this section of the file.
#ifdef TRACE_ENABLE_ITT
68
static const __itt_domain* itt_domain_encode              = __itt_domain_create( "Encode" );
69
static const __itt_string_handle* itt_handle_ctuEncode    = __itt_string_handle_create( "Encode_CTU" );
70
static const __itt_string_handle* itt_handle_rspLfVer     = __itt_string_handle_create( "RspLfVer_CTU" );
71
static const __itt_string_handle* itt_handle_lfHor        = __itt_string_handle_create( "LfHor_CTU" );
72
static const __itt_string_handle* itt_handle_sao          = __itt_string_handle_create( "SAO_CTU" );
73
static const __itt_string_handle* itt_handle_alf_stat     = __itt_string_handle_create( "ALF_CTU_STAT" );
74
static const __itt_string_handle* itt_handle_alf_derive   = __itt_string_handle_create( "ALF_DERIVE" );
75
static const __itt_string_handle* itt_handle_alf_recon    = __itt_string_handle_create( "ALF_RECONSTRUCT" );
76
static const __itt_string_handle* itt_handle_ccalf_stat   = __itt_string_handle_create( "CCALF_CTU_STAT" );
77
static const __itt_string_handle* itt_handle_ccalf_derive = __itt_string_handle_create( "CCALF_DERIVE" );
78
static const __itt_string_handle* itt_handle_ccalf_recon  = __itt_string_handle_create( "CCALF_RECONSTRUCT" );
79
#endif
80
81
void setArbitraryWppPattern( const PreCalcValues& pcv, std::vector<int>& ctuAddrMap, int stepX = 1 )
82
5.19k
{
83
5.19k
  ctuAddrMap.resize( pcv.sizeInCtus, 0 );
84
5.19k
  std::vector<int> x_in_line( pcv.heightInCtus, 0 );
85
5.19k
  int x = 0, y = 0, addr = 0;
86
5.19k
  int y_top = 0;
87
5.19k
  const int step = stepX; // number of CTUs in x-direction to scan 
88
5.19k
  ctuAddrMap[addr++] = x++; // first entry (can be omitted)
89
12.1k
  while( addr < pcv.sizeInCtus )
90
7.87k
  {
91
    // fill entries in x-direction
92
7.87k
    int x1 = x;
93
18.8k
    while( x < std::min(x1 + step, (int)pcv.widthInCtus) )
94
11.0k
    {
95
      // general WPP condition (top-right CTU availability)
96
11.0k
      if( y > 0 && !( x_in_line[y - 1] - x >= 2 ) && x != pcv.widthInCtus - 1 )
97
0
        break;
98
11.0k
      ctuAddrMap[addr++] = y*pcv.widthInCtus + x;
99
11.0k
      x++;
100
11.0k
    }
101
7.87k
    x_in_line[y] = x;
102
        
103
7.87k
    y += 1;
104
105
7.87k
    if( y >= pcv.heightInCtus )
106
4.00k
    {
107
      // go up
108
4.00k
      if( x_in_line[y_top] >= pcv.widthInCtus )
109
4.00k
      {
110
4.00k
        y_top++;
111
4.00k
        if( y_top >= pcv.heightInCtus )
112
936
        {
113
          // done
114
936
          break;
115
936
        }
116
4.00k
      }
117
3.07k
      y = y_top;
118
3.07k
    }
119
6.94k
    x = x_in_line[y];
120
121
6.94k
    CHECK( y >= pcv.heightInCtus, "Height in CTUs is exceeded" );
122
6.94k
  }
123
5.19k
}
124
125
struct TileLineEncRsrc
126
{
127
  BitEstimator            m_BitEstimator;
128
  CABACWriter             m_CABACEstimator;
129
  BitEstimator            m_SaoBitEstimator;
130
  CABACWriter             m_SaoCABACEstimator;
131
  BitEstimator            m_AlfBitEstimator;
132
  CABACWriter             m_AlfCABACEstimator;
133
  ReuseUniMv              m_ReuseUniMv;
134
  BlkUniMvInfoBuffer      m_BlkUniMvInfoBuffer;
135
  AffineProfList          m_AffineProfList;
136
  IbcBvCand               m_CachedBvs;
137
  EncSampleAdaptiveOffset m_encSao;
138
  int                     m_prevQp[ MAX_NUM_CH ];
139
9.06k
  TileLineEncRsrc( const VVEncCfg& encCfg ) : m_CABACEstimator( m_BitEstimator ), m_SaoCABACEstimator( m_SaoBitEstimator ), m_AlfCABACEstimator( m_AlfBitEstimator ) { m_AffineProfList.init( ! encCfg.m_picReordering ); }
140
};
141
142
// Encoder resources that EncSlice keeps once per entry of m_ThreadRsrc. EncSlice::init()
// allocates the instances and EncSlice::~EncSlice() releases them.
struct PerThreadRsrc
143
{
144
  CtxCache  m_CtxCache;
145
  EncCu     m_encCu;            // initialised in EncSlice::init()
146
  PelStorage m_alfTempCtuBuf;   // created in EncSlice::init(), destroyed in EncSlice::~EncSlice()
147
};
148
149
struct CtuEncParam
150
{
151
  Picture*  pic;
152
  EncSlice* encSlice;
153
  int       ctuRsAddr;
154
  int       ctuPosX;
155
  int       ctuPosY;
156
  UnitArea  ctuArea;
157
  int       tileLineResIdx;
158
159
16.2k
  CtuEncParam() : pic( nullptr ), encSlice( nullptr ), ctuRsAddr( 0 ), ctuPosX( 0 ), ctuPosY( 0 ), ctuArea(), tileLineResIdx( 0 ) {}
160
  CtuEncParam( Picture* _p, EncSlice* _s, const int _r, const int _x, const int _y, const int _tileLineResIdx )
161
    : pic( _p )
162
    , encSlice( _s )
163
    , ctuRsAddr( _r )
164
    , ctuPosX( _x )
165
    , ctuPosY( _y )
166
    , ctuArea( pic->chromaFormat, pic->slices[0]->pps->pcv->getCtuArea( _x, _y ) )
167
0
    , tileLineResIdx( _tileLineResIdx ) {}
168
};
169
170
// ====================================================================================================================
171
// Constructor / destructor / create / destroy
172
// ====================================================================================================================
173
174
// Default constructor: all pointers to external objects start as nullptr and are set later in
// init(). The CTU encoding delay starts at 1, the CABAC writer is constructed from
// m_BinEncoder and the CABAC table index starts as VVENC_I_SLICE.
EncSlice::EncSlice()
175
5.19k
  : m_pcEncCfg           ( nullptr)
176
5.19k
  , m_threadPool         ( nullptr )
177
5.19k
  , m_ctuTasksDoneCounter( nullptr )
178
5.19k
  , m_ctuEncDelay        ( 1 )
179
5.19k
  , m_pLoopFilter        ( nullptr )
180
5.19k
  , m_pALF               ( nullptr )
181
5.19k
  , m_pcRateCtrl         ( nullptr )
182
5.19k
  , m_CABACWriter        ( m_BinEncoder )
183
5.19k
  , m_encCABACTableIdx   ( VVENC_I_SLICE )
184
5.19k
{
185
5.19k
}
186
187
188
/** \brief  Releases everything that init() allocated: the tile-line resources, the per-thread
            resources (including their ALF temporary buffers), the SAO reconstruction
            parameters and the per-CTU SAO statistics arrays.
 */
EncSlice::~EncSlice()
{
  for( TileLineEncRsrc* lineRsrc : m_TileLineEncRsrc )
  {
    delete lineRsrc;
  }
  m_TileLineEncRsrc.clear();

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    // the buffer is created explicitly in init(), so it is destroyed explicitly here
    threadRsrc->m_alfTempCtuBuf.destroy();
    delete threadRsrc;
  }
  m_ThreadRsrc.clear();

  m_saoReconParams.clear();

  // per CTU: one array of per-component statistics arrays
  for( auto ctuStats : m_saoStatData )
  {
    for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ )
    {
      delete[] ctuStats[ compIdx ];
    }
    delete[] ctuStats;
  }
  m_saoStatData.clear();
}
215
216
void EncSlice::init( const VVEncCfg& encCfg,
217
                     const SPS& sps,
218
                     const PPS& pps,
219
                     std::vector<int>* const globalCtuQpVector,
220
                     LoopFilter& loopFilter,
221
                     EncAdaptiveLoopFilter& alf,
222
                     RateCtrl& rateCtrl,
223
                     NoMallocThreadPool* threadPool,
224
                     WaitCounter* ctuTasksDoneCounter )
225
5.19k
{
226
5.19k
  m_pcEncCfg            = &encCfg;
227
5.19k
  m_pLoopFilter         = &loopFilter;
228
5.19k
  m_pALF                = &alf;
229
5.19k
  m_pcRateCtrl          = &rateCtrl;
230
5.19k
  m_threadPool          = threadPool;
231
5.19k
  m_ctuTasksDoneCounter = ctuTasksDoneCounter;
232
5.19k
  m_syncPicCtx.resize( encCfg.m_entropyCodingSyncEnabled ? pps.getNumTileLineIds() : 0 );
233
234
  
235
5.19k
  const int maxCntRscr = ( encCfg.m_numThreads > 0 ) ? pps.getNumTileLineIds() : 1;
236
5.19k
  const int maxCtuEnc  = ( encCfg.m_numThreads > 0 && threadPool ) ? threadPool->numThreads() : 1;
237
238
5.19k
  m_ThreadRsrc.resize( maxCtuEnc,  nullptr );
239
5.19k
  m_TileLineEncRsrc.resize( maxCntRscr, nullptr );
240
241
5.19k
  for( PerThreadRsrc*& taskRsc : m_ThreadRsrc )
242
20.7k
  {
243
20.7k
    taskRsc = new PerThreadRsrc();
244
20.7k
    taskRsc->m_encCu.init( encCfg,
245
20.7k
                           sps,
246
20.7k
                           globalCtuQpVector,
247
20.7k
                           m_syncPicCtx.data(),
248
20.7k
                           &rateCtrl );
249
20.7k
    taskRsc->m_alfTempCtuBuf.create( pps.pcv->chrFormat, Area( 0, 0, pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1), pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1) ), pps.pcv->maxCUSize, MAX_ALF_PADDING_SIZE, 0, false );
250
20.7k
  }
251
252
5.19k
  for( TileLineEncRsrc*& lnRsc : m_TileLineEncRsrc )
253
9.06k
  {
254
9.06k
    lnRsc = new TileLineEncRsrc( encCfg );
255
9.06k
    if( sps.saoEnabled )
256
9.06k
    {
257
9.06k
      lnRsc->m_encSao.init( encCfg );
258
9.06k
    }
259
9.06k
  }
260
261
5.19k
  const int sizeInCtus = pps.pcv->sizeInCtus;
262
5.19k
  m_processStates = std::vector<ProcessCtuState>( sizeInCtus );
263
5.19k
  m_saoReconParams.resize( sizeInCtus );
264
265
5.19k
  ::memset( m_saoDisabledRate, 0, sizeof( m_saoDisabledRate ) );
266
267
  // sao statistics
268
5.19k
  if( encCfg.m_bUseSAO )
269
5.19k
  {
270
5.19k
    m_saoStatData.resize( sizeInCtus );
271
21.3k
    for( int i = 0; i < sizeInCtus; i++ )
272
16.2k
    {
273
16.2k
      m_saoStatData[ i ] = new SAOStatData*[ MAX_NUM_COMP ];
274
64.8k
      for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ )
275
48.6k
      {
276
48.6k
        m_saoStatData[ i ][ compIdx ] = new SAOStatData[ NUM_SAO_NEW_TYPES ];
277
48.6k
      }
278
16.2k
    }
279
5.19k
  }
280
5.19k
  ctuEncParams.resize( sizeInCtus );
281
5.19k
  setArbitraryWppPattern( *pps.pcv, m_ctuAddrMap, 3 );
282
283
5.19k
  const unsigned asuHeightInCtus = m_pALF->getAsuHeightInCtus();
284
5.19k
  const unsigned numDeriveLines  = encCfg.m_ifpLines ? 
285
5.19k
    std::min( ((encCfg.m_ifpLines & (~(asuHeightInCtus - 1))) + asuHeightInCtus), pps.pcv->heightInCtus ) : pps.pcv->heightInCtus;
286
5.19k
  m_alfDeriveCtu  = numDeriveLines * pps.pcv->widthInCtus - 1;
287
5.19k
  m_ccalfDeriveCtu = encCfg.m_ifpLines ? pps.pcv->widthInCtus * std::min((unsigned)encCfg.m_ifpLines + 1, pps.pcv->heightInCtus) - 1: pps.pcv->sizeInCtus - 1;
288
5.19k
}
289
290
291
/** \brief  Prepares the slice encoder for a new picture: slice map, CABAC table index, slice
            QP / lambda, per-thread and per-tile-line state and the CTU encoding delay.
    \param pic  picture to be encoded; its coding structure and slice have to be set up already
 */
void EncSlice::initPic( Picture* pic )
{
  Slice* slice = pic->cs->slice;

  const bool hasMultipleTiles = slice->pps->numTileCols * slice->pps->numTileRows > 1;
  if( hasMultipleTiles )
  {
    slice->sliceMap = slice->pps->sliceMap[0];
  }
  else
  {
    // single tile: the slice covers all CTUs of the picture
    slice->sliceMap.addCtusToSlice( 0, pic->cs->pcv->widthInCtus, 0, pic->cs->pcv->heightInCtus, pic->cs->pcv->widthInCtus);
  }

  // this ensures that independently encoded bitstream chunks can be combined to bit-equal
  const bool useSliceTypeAsTable = ( ! slice->pps->cabacInitPresent ) || slice->pendingRasInit;
  if( useSliceTypeAsTable )
  {
    slice->encCABACTableIdx = slice->sliceType;
  }
  else
  {
    slice->encCABACTableIdx = m_encCABACTableIdx;
  }

  // set QP and lambda values
  xInitSliceLambdaQP( slice );

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.initPic( pic );
  }

  for( TileLineEncRsrc* lineRsrc : m_TileLineEncRsrc )
  {
    lineRsrc->m_ReuseUniMv.resetReusedUniMvs();
  }

  m_ctuEncDelay = 1;
  if( pic->useIBC )
  {
    // IBC needs unfiltered samples up to max IBC search range
    // therefore ensure that numCtuDelayLUT CTU's have been encoded first
    // assuming IBC localSearchRangeX / Y = 128
    // (table index is maxCUSizeLog2 - 5, i.e. one entry each for log2 sizes 5, 6 and 7)
    const int numCtuDelayLUT[ 3 ] = { 15, 3, 1 };
    CHECK( pic->cs->pcv->maxCUSizeLog2 < 5 || pic->cs->pcv->maxCUSizeLog2 > 7, "invalid max CTUSize" );
    m_ctuEncDelay = numCtuDelayLUT[ pic->cs->pcv->maxCUSizeLog2 - 5 ];
  }
}
332
333
334
335
/** \brief  Determines slice QP, lambda and the slice-level chroma QP deltas and passes the
            lambda / QP on to every per-thread CU encoder.

    When rate control has already provided an initial QP for the picture (picInitialQP >= 0
    with a target bitrate set), that QP and picInitialLambda are used. Otherwise both are
    derived through xGetQPForPicture() and xCalculateLambda(). With perceptual QPA enabled
    the slice QP is adapted further and lambda is rescaled accordingly.

    \param slice  slice to initialise; sliceQp, sliceChromaQpDelta and chromaQpAdjEnabled are written
 */
void EncSlice::xInitSliceLambdaQP( Slice* slice )
{
  // true when the intra-or-periodic chroma QP offsets apply to this slice
  const auto usesIntraOrPeriodicOffset = [&]() -> bool
  {
    return (slice->isIntra() && !slice->sps->IBC) || (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity > 0 && (slice->poc % m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity) == 0);
  };

  // pre-compute lambda and QP
  const bool rcp = (m_pcEncCfg->m_RCTargetBitrate > 0 && slice->pic->picInitialQP >= 0); // 2nd pass
  int  iQP = Clip3 (-slice->sps->qpBDOffset[CH_L], MAX_QP, slice->pic->picInitialQP); // RC start QP
  double dQP;
  double dLambda;
  if (rcp)
  {
    dQP     = (double) slice->pic->picInitialQP;
    dLambda = slice->pic->picInitialLambda;
  }
  else
  {
    dQP     = xGetQPForPicture (slice);
    dLambda = xCalculateLambda (slice, slice->TLayer, dQP, dQP, iQP); // also updates iQP
  }

  int chromaOffsetsIntraOrPeriodic[2] = { m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[0], m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[1] };
  const int lookAheadRCCQpOffset = 0;   // was (m_pcEncCfg->m_RCTargetBitrate > 0 && m_pcEncCfg->m_LookAhead && CS::isDualITree (*slice->pic->cs) ? 1 : 0);
  int cbQP = 0, crQP = 0, cbCrQP = 0;

  if (m_pcEncCfg->m_usePerceptQPA) // adapt chromaOffsetsIntraOrPeriodic and pic->ctuAdaptedQP
  {
    const bool cqp = usesIntraOrPeriodicOffset();
    const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
    const uint32_t boundingCtuTsAddr = slice->pic->cs->pcv->sizeInCtus;

    const int adaptedQP = BitAllocation::applyQPAdaptationSlice (slice, m_pcEncCfg, iQP, dLambda, &slice->pic->picVA.visAct, // updates pic->picInitialQP
                                                                 *m_ThreadRsrc[0]->m_encCu.getQpPtr(), m_pcRateCtrl->getIntraPQPAStats(),
                                                                 (slice->pps->sliceChromaQpFlag && cqp ? chromaOffsetsIntraOrPeriodic : nullptr),
                                                                 m_pcRateCtrl->getMinNoiseLevels(), startCtuTsAddr, boundingCtuTsAddr);
    if (adaptedQP >= 0) // QP OK?
    {
      iQP = adaptedQP;
      dLambda *= pow (2.0, ((double) iQP - dQP) / 3.0); // adjust lambda based on change of slice QP
    }
    else
    {
      iQP = (int) dQP; // revert to unadapted slice QP
    }
  }
  else if (rcp)
  {
    slice->pic->picInitialQP = -1; // no QPA - unused now
  }

  if (slice->pps->sliceChromaQpFlag)
  {
    const bool useDualTreeOffsets = CS::isDualITree (*slice->pic->cs) && !m_pcEncCfg->m_usePerceptQPA && (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity == 0);

    if (useDualTreeOffsets)
    {
      cbQP = m_pcEncCfg->m_chromaCbQpOffsetDualTree + lookAheadRCCQpOffset; // QP offset for dual-tree
      crQP = m_pcEncCfg->m_chromaCrQpOffsetDualTree + lookAheadRCCQpOffset;
      cbCrQP = m_pcEncCfg->m_chromaCbCrQpOffsetDualTree + lookAheadRCCQpOffset;
    }
    else
    {
      const GOPEntry &gopEntry             = *(slice->pic->gopEntry);
      const bool bUseIntraOrPeriodicOffset = usesIntraOrPeriodicOffset();

      if (bUseIntraOrPeriodicOffset)
      {
        cbQP = chromaOffsetsIntraOrPeriodic[0] + lookAheadRCCQpOffset;
        crQP = chromaOffsetsIntraOrPeriodic[1] + lookAheadRCCQpOffset;
      }
      else
      {
        cbQP = gopEntry.m_CbQPoffset + lookAheadRCCQpOffset;
        crQP = gopEntry.m_CrQPoffset + lookAheadRCCQpOffset;
      }
      cbCrQP = (cbQP + crQP) >> 1; // use floor of average CbCr chroma QP offset for joint-CbCr coding

      // limit each offset so that PPS offset plus slice offset stays within [-12, 12]
      cbQP = Clip3 (-12, 12, cbQP + slice->pps->chromaQpOffset[COMP_Cb]) - slice->pps->chromaQpOffset[COMP_Cb];
      crQP = Clip3 (-12, 12, crQP + slice->pps->chromaQpOffset[COMP_Cr]) - slice->pps->chromaQpOffset[COMP_Cr];
      cbCrQP = Clip3 (-12, 12, cbCrQP + slice->pps->chromaQpOffset[COMP_JOINT_CbCr]) - slice->pps->chromaQpOffset[COMP_JOINT_CbCr];
    }
  }

  slice->sliceChromaQpDelta[COMP_Cb] = Clip3 (-12, 12, cbQP);
  slice->sliceChromaQpDelta[COMP_Cr] = Clip3 (-12, 12, crQP);
  slice->sliceChromaQpDelta[COMP_JOINT_CbCr] = (slice->sps->jointCbCr ? Clip3 (-12, 12, cbCrQP) : 0);

  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.setUpLambda( *slice, dLambda, iQP, true, true );
  }

  slice->sliceQp            = iQP;
  slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0;
}
398
399
// QP offset table indexed by gopEntry->m_vtl; xGetQPForPicture() adds the selected entry to the
// scaled QP of non-intra pictures when perceptual QPA is enabled. The table has six entries.
static const int highTL[6] = { -1, 0, 0, 2, 4, 5 };
400
401
int EncSlice::xGetQPForPicture( const Slice* slice )
402
1.29k
{
403
1.29k
  const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ];
404
1.29k
  int qp;
405
406
1.29k
  if ( m_pcEncCfg->m_costMode == VVENC_COST_LOSSLESS_CODING )
407
0
  {
408
0
    qp = LOSSLESS_AND_MIXED_LOSSLESS_RD_COST_TEST_QP;
409
0
  }
410
1.29k
  else
411
1.29k
  {
412
1.29k
    qp = m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP;
413
414
1.29k
    if (m_pcEncCfg->m_usePerceptQPA)
415
1.29k
    {
416
1.29k
      const int tlayer = slice->pic->gopEntry->m_vtl;
417
418
1.29k
      qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[tlayer] + ((qp * (16 + std::min (2, tlayer))) >> 4) + 0/*TODO +-1?*/);
419
1.29k
    }
420
0
    else if( slice->isIntra() )
421
0
    {
422
0
      qp += m_pcEncCfg->m_intraQPOffset;
423
0
    }
424
0
    else
425
0
    {
426
0
      if( qp != -lumaQpBDOffset )
427
0
      {
428
0
        const GOPEntry &gopEntry = *(slice->pic->gopEntry);
429
        // adjust QP according to the QP offset for the GOP entry.
430
0
        qp += gopEntry.m_QPOffset;
431
432
        // adjust QP according to QPOffsetModel for the GOP entry.
433
0
        double dqpOffset = qp * gopEntry.m_QPOffsetModelScale + gopEntry.m_QPOffsetModelOffset + 0.5;
434
0
        int qpOffset = (int)floor( Clip3<double>( 0.0, 3.0, dqpOffset ) );
435
0
        qp += qpOffset;
436
0
      }
437
0
    }
438
439
1.29k
    if( m_pcEncCfg->m_blockImportanceMapping && !slice->pic->m_picShared->m_ctuBimQpOffset.empty() )
440
0
    {
441
0
      qp += slice->pic->m_picShared->m_picAuxQpOffset;
442
0
    }
443
1.29k
  }
444
1.29k
  qp = Clip3( -lumaQpBDOffset, MAX_QP, qp );
445
1.29k
  return qp;
446
1.29k
}
447
448
449
double EncSlice::xCalculateLambda( const Slice* slice,
450
                                   const int    depth, // slice GOP hierarchical depth.
451
                                   const double refQP, // initial slice-level QP
452
                                   const double dQP,   // initial double-precision QP
453
                                         int&   iQP )  // returned integer QP.
454
1.29k
{
455
1.29k
  const GOPEntry &gopEntry = *(slice->pic->gopEntry);
456
1.29k
  const int SHIFT_QP       = 12;
457
1.29k
  const int temporalId     = gopEntry.m_temporalId;
458
1.29k
  std::vector<double> intraLambdaModifiers;
459
1.29k
  for ( int i = 0; i < VVENC_MAX_TLAYER; i++ )
460
1.29k
  {
461
1.29k
    if( m_pcEncCfg->m_adIntraLambdaModifier[i] != 0.0 ) intraLambdaModifiers.push_back( m_pcEncCfg->m_adIntraLambdaModifier[i] );
462
1.29k
    else break;
463
1.29k
  }
464
465
1.29k
  int bitdepth_luma_qp_scale = 6
466
1.29k
                               * (slice->sps->bitDepths[ CH_L ] - 8
467
1.29k
                                  - DISTORTION_PRECISION_ADJUSTMENT(slice->sps->bitDepths[ CH_L ]));
468
1.29k
  double qp_temp = dQP + bitdepth_luma_qp_scale - SHIFT_QP;
469
  // Case #1: I or P-slices (key-frame)
470
1.29k
  double dQPFactor = gopEntry.m_QPFactor;
471
1.29k
  if( slice->sliceType == VVENC_I_SLICE )
472
1.29k
  {
473
1.29k
    if (m_pcEncCfg->m_dIntraQpFactor>=0.0 && gopEntry.m_sliceType != 'I')
474
0
    {
475
0
      dQPFactor = m_pcEncCfg->m_dIntraQpFactor;
476
0
    }
477
1.29k
    else
478
1.29k
    {
479
1.29k
      dQPFactor = 0.57;
480
1.29k
      if( ! m_pcEncCfg->m_lambdaFromQPEnable )
481
0
      {
482
0
        const int NumberBFrames = ( m_pcEncCfg->m_GOPSize - 1 );
483
0
        const double dLambda_scale = 1.0 - Clip3( 0.0, 0.5, 0.05 * (double)NumberBFrames );
484
0
        dQPFactor *= dLambda_scale;
485
0
      }
486
1.29k
    }
487
1.29k
  }
488
0
  else if( m_pcEncCfg->m_lambdaFromQPEnable )
489
0
  {
490
0
    dQPFactor=0.57;
491
0
  }
492
493
1.29k
  double dLambda = dQPFactor*pow( 2.0, qp_temp/3.0 );
494
495
1.29k
  if( !(m_pcEncCfg->m_lambdaFromQPEnable) && depth>0 )
496
0
  {
497
0
    double qp_temp_ref = refQP + bitdepth_luma_qp_scale - SHIFT_QP;
498
0
    dLambda *= Clip3(2.00, 4.00, (qp_temp_ref / 6.0));   // (j == B_SLICE && p_cur_frm->layer != 0 )
499
0
  }
500
501
  // if hadamard is used in ME process
502
1.29k
  if ( !m_pcEncCfg->m_bUseHADME && slice->sliceType != VVENC_I_SLICE )
503
0
  {
504
0
    dLambda *= 0.95;
505
0
  }
506
507
1.29k
  double lambdaModifier;
508
1.29k
  if( slice->sliceType != VVENC_I_SLICE || intraLambdaModifiers.empty())
509
1.29k
  {
510
1.29k
    lambdaModifier = m_pcEncCfg->m_adLambdaModifier[ temporalId ];
511
1.29k
  }
512
0
  else
513
0
  {
514
0
    lambdaModifier = intraLambdaModifiers[ (temporalId < intraLambdaModifiers.size()) ? temporalId : (intraLambdaModifiers.size()-1) ];
515
0
  }
516
1.29k
  dLambda *= lambdaModifier;
517
518
1.29k
  iQP = Clip3( -slice->sps->qpBDOffset[ CH_L ], MAX_QP, (int) floor( dQP + 0.5 ) );
519
520
1.29k
  if( m_pcEncCfg->m_DepQuantEnabled )
521
1.29k
  {
522
1.29k
    dLambda *= pow( 2.0, 0.25/3.0 ); // slight lambda adjustment for dependent quantization (due to different slope of quantizer)
523
1.29k
  }
524
525
  // NOTE: the lambda modifiers that are sometimes applied later might be best always applied in here.
526
1.29k
  return dLambda;
527
1.29k
}
528
529
530
// ====================================================================================================================
531
// Public member functions
532
// ====================================================================================================================
533
534
535
/** \param pic   picture class
536
 */
537
void EncSlice::compressSlice( Picture* pic )
538
1.29k
{
539
1.29k
  PROFILER_SCOPE_AND_STAGE( 1, g_timeProfiler, P_COMPRESS_SLICE );
540
1.29k
  CodingStructure& cs         = *pic->cs;
541
1.29k
  Slice* const slice          = cs.slice;
542
1.29k
  uint32_t  startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
543
1.29k
  uint32_t  boundingCtuTsAddr = pic->cs->pcv->sizeInCtus;
544
545
1.29k
  cs.pcv      = slice->pps->pcv;
546
1.29k
  cs.fracBits = 0;
547
548
1.29k
  if( startCtuTsAddr == 0 )
549
1.29k
  {
550
1.29k
    cs.initStructData( slice->sliceQp );
551
1.29k
  }
552
553
1.29k
  for( auto* thrRsrc : m_ThreadRsrc )
554
5.19k
  {
555
5.19k
    thrRsrc->m_encCu.initSlice( slice );
556
5.19k
  }
557
558
1.29k
  for( auto* lnRsrc : m_TileLineEncRsrc )
559
2.26k
  {
560
2.26k
    lnRsrc->m_CABACEstimator    .initCtxModels( *slice );
561
2.26k
    lnRsrc->m_SaoCABACEstimator .initCtxModels( *slice );
562
2.26k
    lnRsrc->m_AlfCABACEstimator .initCtxModels( *slice );
563
2.26k
    lnRsrc->m_AffineProfList    .resetAffineMVList();
564
2.26k
    lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
565
2.26k
    lnRsrc->m_CachedBvs         .resetIbcBvCand();
566
567
2.26k
    if( slice->sps->saoEnabled && pic->useSAO )
568
2.26k
    {
569
2.26k
      lnRsrc->m_encSao          .initSlice( slice );
570
2.26k
    }
571
2.26k
  }
572
573
1.29k
  if( slice->sps->fpelMmvd && !slice->picHeader->disFracMMVD )
574
1.29k
  {
575
1.29k
    slice->picHeader->disFracMMVD = ( pic->lwidth() * pic->lheight() > 1920 * 1080 ) ? true : false;
576
1.29k
  }
577
578
1.29k
  xProcessCtus( pic, startCtuTsAddr, boundingCtuTsAddr );
579
1.29k
}
580
581
/** \brief  Chooses the sign of the joint Cb-Cr transform for a picture area.

    Both chroma planes of the original picture (the reshaped original when a filtered original
    buffer is valid) are high-pass filtered with a 3x3 kernel and the products of the filtered
    Cb and Cr samples are summed. The sign flag is set when that sum is negative. Positions on
    the picture border are skipped, because the kernel reads all eight neighbours.
    Without chroma the flag is set to true.

    \param cs           coding structure; cs.slice->picHeader->jointCbCrSign is written
    \param topLeftLuma  top-left position of the area in luma samples
    \param sizeLuma     size of the area in luma samples
 */
void setJointCbCrModes( CodingStructure& cs, const Position topLeftLuma, const Size sizeLuma )
{
  bool              sgnFlag = true;

  if( isChromaEnabled( cs.picture->chromaFormat) )
  {
    const CompArea  cbArea  = CompArea( COMP_Cb, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );
    const CompArea  crArea  = CompArea( COMP_Cr, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );

    const bool      useRsp  = cs.picture->getFilteredOrigBuffer().valid();
    const CPelBuf   orgCb   = useRsp ? cs.picture->getRspOrigBuf( cbArea ): cs.picture->getOrigBuf( cbArea );
    const CPelBuf   orgCr   = useRsp ? cs.picture->getRspOrigBuf( crArea ): cs.picture->getOrigBuf( crArea );
    // shrink the scanned window by one sample wherever the area touches the picture border
    const int       x0      = ( cbArea.x > 0 ? 0 : 1 );
    const int       y0      = ( cbArea.y > 0 ? 0 : 1 );
    const int       x1      = ( cbArea.x + cbArea.width  < cs.picture->Cb().width  ? cbArea.width  : cbArea.width  - 1 );
    const int       y1      = ( cbArea.y + cbArea.height < cs.picture->Cb().height ? cbArea.height : cbArea.height - 1 );
    const int       cbs     = orgCb.stride;
    const int       crs     = orgCr.stride;
    const Pel*      pCb     = orgCb.buf + y0 * cbs;
    const Pel*      pCr     = orgCr.buf + y0 * crs;
    int64_t         sumCbCr = 0;

    // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes
    for( int y = y0; y < y1; y++, pCb += cbs, pCr += crs )
    {
      for( int x = x0; x < x1; x++ )
      {
        const int cb = ( 12*(int)pCb[x] - 2*((int)pCb[x-1] + (int)pCb[x+1] + (int)pCb[x-cbs] + (int)pCb[x+cbs]) - ((int)pCb[x-1-cbs] + (int)pCb[x+1-cbs] + (int)pCb[x-1+cbs] + (int)pCb[x+1+cbs]) );
        const int cr = ( 12*(int)pCr[x] - 2*((int)pCr[x-1] + (int)pCr[x+1] + (int)pCr[x-crs] + (int)pCr[x+crs]) - ((int)pCr[x-1-crs] + (int)pCr[x+1-crs] + (int)pCr[x-1+crs] + (int)pCr[x+1+crs]) );
        // form the product in 64 bit: each filter output can reach 12 times the sample range,
        // so an int product may overflow for large sample values
        sumCbCr += static_cast<int64_t>( cb ) * cr;
      }
    }

    sgnFlag = ( sumCbCr < 0 );
  }

  cs.slice->picHeader->jointCbCrSign = sgnFlag;
}
618
619
/** \brief  Immutable position of a CTU: column, row and raster-scan address. */
struct CtuPos
{
  const int ctuPosX;    // CTU column
  const int ctuPosY;    // CTU row
  const int ctuRsAddr;  // raster-scan address

  /** \param _x  CTU column
      \param _y  CTU row
      \param _a  raster-scan address
   */
  CtuPos( int _x, int _y, int _a )
    : ctuPosX  ( _x )
    , ctuPosY  ( _y )
    , ctuRsAddr( _a )
  {
  }
};
627
628
class CtuTsIterator
629
{
630
  private:
631
    const CodingStructure& cs;
632
    const int        m_startTsAddr;
633
    const int        m_endTsAddr;
634
    std::vector<int> m_ctuAddrMap;
635
          int        m_ctuTsAddr;
636
637
  private:
638
    int getNextTsAddr( const int _tsAddr ) const
639
4.05k
    {
640
4.05k
      const PreCalcValues& pcv  = *cs.pcv;
641
4.05k
      const int startSliceRsRow = m_startTsAddr / pcv.widthInCtus;
642
4.05k
      const int startSliceRsCol = m_startTsAddr % pcv.widthInCtus;
643
4.05k
      const int endSliceRsRow   = (m_endTsAddr - 1) / pcv.widthInCtus;
644
4.05k
      const int endSliceRsCol   = (m_endTsAddr - 1) % pcv.widthInCtus;
645
4.05k
            int ctuTsAddr = _tsAddr;
646
4.05k
      CHECK( ctuTsAddr > m_endTsAddr, "error: array index out of bounds" );
647
5.34k
      while( ctuTsAddr < m_endTsAddr )
648
4.05k
      {
649
4.05k
        ctuTsAddr++;
650
4.05k
        const int ctuRsAddr = ctuTsAddr; 
651
4.05k
        if( cs.slice->pps->rectSlice
652
4.05k
            && ( (ctuRsAddr / pcv.widthInCtus) < startSliceRsRow
653
4.05k
              || (ctuRsAddr / pcv.widthInCtus) > endSliceRsRow
654
2.75k
              || (ctuRsAddr % pcv.widthInCtus) < startSliceRsCol
655
2.75k
              || (ctuRsAddr % pcv.widthInCtus) > endSliceRsCol ) )
656
1.29k
          continue;
657
2.75k
        break;
658
4.05k
      }
659
4.05k
      return ctuTsAddr;
660
4.05k
    }
661
662
    int mapAddr( const int _addr ) const
663
4.05k
    {
664
4.05k
      if( _addr < 0 )
665
0
        return _addr;
666
4.05k
      if( _addr >= m_ctuAddrMap.size() )
667
0
        return _addr;
668
4.05k
      return m_ctuAddrMap[ _addr ];
669
4.05k
    }
670
671
  public:
672
1.29k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e,       std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
673
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, bool _wpp                          ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ),                     m_ctuTsAddr( _s ) { if( _wpp ) setWppPattern(); }
674
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
675
1.29k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m, int _c ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( std::max( _s, _c ) ) {}
676
1.29k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>* _m, bool _wpp ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuTsAddr( _s ) {  if( _wpp ) m_ctuAddrMap = *_m;  }
677
678
9.24k
    virtual ~CtuTsIterator() { m_ctuAddrMap.clear(); }
679
680
4.05k
    CtuTsIterator& operator++()                { m_ctuTsAddr = getNextTsAddr( m_ctuTsAddr ); return *this; }
681
0
    CtuTsIterator  operator++(int)             { auto retval = *this; ++(*this); return retval; }
682
0
    bool operator==(CtuTsIterator other) const { return m_ctuTsAddr == other.m_ctuTsAddr; }
683
5.34k
    bool operator!=(CtuTsIterator other) const { return m_ctuTsAddr != other.m_ctuTsAddr; }
684
4.05k
    CtuPos operator*()                   const { const int ctuRsAddr = mapAddr( m_ctuTsAddr );  return CtuPos( ctuRsAddr % cs.pcv->widthInCtus, ctuRsAddr / cs.pcv->widthInCtus, ctuRsAddr ); }
685
686
1.29k
    CtuTsIterator begin() { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap ); };
687
1.29k
    CtuTsIterator end()   { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap, m_endTsAddr ); };
688
689
    using iterator_category = std::forward_iterator_tag;
690
    using value_type        = int;
691
    using pointer           = int*;
692
    using reference         = int&;
693
    using difference_type   = ptrdiff_t;
694
695
    void setWppPattern()
696
0
    {
697
0
      const PreCalcValues& pcv = *cs.pcv;
698
0
      m_ctuAddrMap.resize( pcv.sizeInCtus, 0 );
699
0
      int addr = 0;
700
0
      for( int i = 1; i < pcv.sizeInCtus; i++ )
701
0
      {
702
0
        int x = addr % pcv.widthInCtus;
703
0
        int y = addr / pcv.widthInCtus;
704
0
        x -= 1;
705
0
        y += 1;
706
0
        if( x < 0 || y >= pcv.heightInCtus )
707
0
        {
708
0
          x += 1 + y;
709
0
          y  = 0;
710
0
        }
711
0
        if( x >= pcv.widthInCtus )
712
0
        {
713
0
          y += ( x - pcv.widthInCtus ) + 1;
714
0
          x  = pcv.widthInCtus - 1;
715
0
        }
716
0
        addr = y * pcv.widthInCtus + x;
717
0
        m_ctuAddrMap[ i ] = addr;
718
0
      }
719
0
    }
720
};
721
722
// Forwards to EncSampleAdaptiveOffset::disabledRate(), passing this slice encoder's
// m_saoDisabledRate together with the SAO encoding rates and the internal chroma format
// taken from the encoder configuration.
//   cs           coding structure handed through to disabledRate()
//   reconParams  SAO block parameters handed through to disabledRate()
void EncSlice::saoDisabledRate( CodingStructure& cs, SAOBlkParam* reconParams )
723
0
{
724
0
  EncSampleAdaptiveOffset::disabledRate( cs, m_saoDisabledRate, reconParams, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat );
725
0
}
726
727
// Finalizes SAO related state after a picture has been compressed.
// Does nothing unless SAO is enabled in the SPS of `slice` and pic->useSAO is set.
//   pic    picture whose slices receive the SAO enable flags
//   slice  only read here for slice.sps->saoEnabled
void EncSlice::finishCompressSlice( Picture* pic, Slice& slice )
728
1.29k
{
729
1.29k
  CodingStructure& cs = *pic->cs;
730
731
  // finalize
732
1.29k
  if( slice.sps->saoEnabled && pic->useSAO )
733
1.29k
  {
734
    // store disabled statistics
    // (only done here when m_numThreads is 0)
735
1.29k
    if( !m_pcEncCfg->m_numThreads )
736
0
      saoDisabledRate( cs, &m_saoReconParams[ 0 ] );
737
738
    // set slice header flags
    // Cb and Cr must agree because a single chroma flag (CH_C) is written from the Cb entry
739
1.29k
    CHECK( m_saoEnabled[ COMP_Cb ] != m_saoEnabled[ COMP_Cr ], "Unspecified error");
740
1.29k
    for( auto s : pic->slices )
741
1.29k
    {
742
1.29k
      s->saoEnabled[ CH_L ] = m_saoEnabled[ COMP_Y  ];
743
1.29k
      s->saoEnabled[ CH_C ] = m_saoEnabled[ COMP_Cb ];
744
1.29k
    }
745
1.29k
  }
746
1.29k
}
747
748
// Prepares the per-picture in-loop filter state and runs the CTU processing for the
// tile-scan range [startCtuTsAddr, boundingCtuTsAddr).
//  1. Optional joint-CbCr mode setup, SAO picture-level decision and ALF initialization,
//     depending on the SPS flags.
//  2. All entries of m_processStates are reset to CTU_ENCODE.
//  3. ctuEncParams is filled with one entry per CTU, in the order delivered by CtuTsIterator.
//  4. With m_numThreads > 0 one barrier task per entry is queued on the thread pool;
//     otherwise xProcessCtuTask<false> is called in a loop on every CTU that is not yet
//     PROCESS_DONE, until the CTU at index boundingCtuTsAddr - 1 reaches PROCESS_DONE.
// An unused local copy of slice.sliceMap.ctuAddrInSlice was removed from step 3.
void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr )
749
1.29k
{
750
1.29k
  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs );
751
1.29k
  CodingStructure& cs      = *pic->cs;
752
1.29k
  Slice&           slice   = *cs.slice;
753
1.29k
  const PreCalcValues& pcv = *cs.pcv;
754
755
  // initialization
756
1.29k
  if( slice.sps->jointCbCr )
757
1.29k
  {
758
1.29k
    setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() );
759
1.29k
  }
760
761
1.29k
  if( slice.sps->saoEnabled && pic->useSAO )
762
1.29k
  {
763
    // check SAO enabled or disabled
764
1.29k
    EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat );
765
766
    // m_saoAllDisabled stays true only if SAO is off for every valid component
1.29k
    m_saoAllDisabled = true;
767
5.19k
    for( int compIdx = 0; compIdx < getNumberValidComponents( pcv.chrFormat ); compIdx++ )
768
3.89k
    {
769
3.89k
      m_saoAllDisabled &= ! m_saoEnabled[ compIdx ];
770
3.89k
    }
771
772
1.29k
    std::fill( m_saoReconParams.begin(), m_saoReconParams.end(), SAOBlkParam() );
773
1.29k
  }
774
0
  else
775
0
  {
776
0
    m_saoAllDisabled = true;
777
0
  }
778
779
1.29k
  if( slice.sps->alfEnabled )
780
1.29k
  {
781
1.29k
    m_pALF->initEncProcess( slice );
782
1.29k
  }
783
784
1.29k
  std::fill( m_processStates.begin(), m_processStates.end(), CTU_ENCODE );
785
786
  // fill encoder parameter list
787
1.29k
  int idx = 0;
789
1.29k
  auto ctuIter = CtuTsIterator( cs, startCtuTsAddr, boundingCtuTsAddr, &m_ctuAddrMap, m_pcEncCfg->m_numThreads > 0 );
790
1.29k
  for( auto ctuPos : ctuIter )
791
4.05k
  {
792
4.05k
    ctuEncParams[ idx ].pic       = pic;
793
4.05k
    ctuEncParams[ idx ].encSlice  = this;
794
4.05k
    ctuEncParams[ idx ].ctuRsAddr = ctuPos.ctuRsAddr;
795
4.05k
    ctuEncParams[ idx ].ctuPosX   = ctuPos.ctuPosX;
796
4.05k
    ctuEncParams[ idx ].ctuPosY   = ctuPos.ctuPosY;
797
4.05k
    ctuEncParams[ idx ].ctuArea   = UnitArea( pic->chromaFormat, slice.pps->pcv->getCtuArea( ctuPos.ctuPosX, ctuPos.ctuPosY ) );
798
799
    // per tile-line resources are only distinguished when running with threads
4.05k
    if( m_pcEncCfg->m_numThreads > 0 )
800
4.05k
    {
801
4.05k
      ctuEncParams[idx].tileLineResIdx = slice.pps->getTileLineId( ctuPos.ctuPosX, ctuPos.ctuPosY );
802
4.05k
    }
803
0
    else
804
0
    {
805
0
      ctuEncParams[idx].tileLineResIdx = 0;
806
0
    }
807
4.05k
    idx++;
808
4.05k
  }
809
810
  //for( int i = 0; i < idx; i++ )
811
  //{
812
  //  for( int j = i; j < idx; j++ )
813
  //  {
814
  //    if( ctuEncParams[i].tileLineResIdx != ctuEncParams[j].tileLineResIdx ) continue;
815
  //
816
  //    CHECK( ctuEncParams[i].ctuPosY != ctuEncParams[j].ctuPosY, "Not the same CTU line!" );
817
  //    CHECK( slice.pps->getTileIdx( ctuEncParams[i].ctuPosX, ctuEncParams[i].ctuPosY ) != slice.pps->getTileIdx( ctuEncParams[j].ctuPosX, ctuEncParams[j].ctuPosY ), "Not the same tile!" );
818
  //  }
819
  //}
820
821
  // the loop above must have produced exactly one entry per CTU of the picture
1.29k
  CHECK( idx != pcv.sizeInCtus, "array index out of bounds" );
822
823
  // process ctu's until last ctu is done
824
1.29k
  if( m_pcEncCfg->m_numThreads > 0 )
825
1.29k
  {
826
1.29k
    for( auto& ctuEncParam : ctuEncParams )
827
4.05k
    {
828
      // xProcessCtuTask<false> is passed as the task, xProcessCtuTask<true> as the last argument
4.05k
      m_threadPool->addBarrierTask( EncSlice::xProcessCtuTask<false>,
829
4.05k
                                    &ctuEncParam,
830
4.05k
                                    m_ctuTasksDoneCounter,
831
4.05k
                                    nullptr,
832
4.05k
                                    {},
833
4.05k
                                    EncSlice::xProcessCtuTask<true> );
834
4.05k
    }
835
1.29k
  }
836
0
  else
837
0
  {
838
0
    do
839
0
    {
840
0
      for( auto& ctuEncParam : ctuEncParams )
841
0
      {
842
0
        if( m_processStates[ctuEncParam.ctuRsAddr] != PROCESS_DONE )
843
0
          EncSlice::xProcessCtuTask<false>( 0, &ctuEncParam );
844
0
      }
845
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_LUMA_LF,   cs, cs.getRecoBuf(), COMP_Y  );
846
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cb );
847
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cr );
848
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_LUMA_SAO,   cs, cs.getRecoBuf(), COMP_Y  );
849
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cb );
850
0
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cr );
851
0
    }
852
0
    while( m_processStates[ boundingCtuTsAddr - 1 ] != PROCESS_DONE );
853
0
  }
854
1.29k
}
855
856
// Dependency check against the CTU directly above (ctuRsAddr - widthInCtus).
// Returns true when all of the following hold:
//   - a CTU row above exists (ctuPosY > 0),
//   - `override` is set or pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, -1 ) is true,
//   - the process state of that CTU is still <= tskType.
// Callers in this file return "not ready" when this yields true.
inline bool checkCtuTaskNbTop( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
857
835k
{
858
835k
  return ctuPosY > 0 && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus ] <= tskType;
859
835k
}
860
861
// Dependency check against the CTU directly below (ctuRsAddr + widthInCtus).
// Returns true when all of the following hold:
//   - a CTU row below exists (ctuPosY + 1 < heightInCtus),
//   - `override` is set or pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, 1 ) is true,
//   - the process state of that CTU is still <= tskType.
// Callers in this file return "not ready" when this yields true.
inline bool checkCtuTaskNbBot( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
862
313k
{
863
313k
  return ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, 1 ) ) && processStates[ ctuRsAddr     + pps.pcv->widthInCtus ] <= tskType;
864
313k
}
865
866
// Dependency check against the CTU directly to the right (ctuRsAddr + 1).
// Returns true when all of the following hold:
//   - a CTU column to the right exists (ctuPosX + 1 < widthInCtus),
//   - `override` is set or pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, 0 ) is true,
//   - the process state of that CTU is still <= tskType.
// Callers in this file return "not ready" when this yields true.
inline bool checkCtuTaskNbRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
867
665k
{
868
665k
  return ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, 0 ) ) && processStates[ ctuRsAddr + 1 ] <= tskType;
869
665k
}
870
871
// Dependency check against the top-right CTU (ctuRsAddr - widthInCtus + 1).
// Returns true when all of the following hold:
//   - a CTU row above and a CTU column to the right exist,
//   - `override` is set or pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, -1 ) is true,
//   - the process state of that CTU is still <= tskType.
// Callers in this file return "not ready" when this yields true.
inline bool checkCtuTaskNbTopRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
872
259k
{
873
259k
  return ctuPosY > 0 && ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus + 1 ] <= tskType;
874
259k
}
875
876
// Dependency check against the CTU one row below and `rightOffset` columns to the right
// (ctuRsAddr + rightOffset + widthInCtus). With rightOffset == 0 this is the CTU directly below.
// Returns true when all of the following hold:
//   - that position lies inside the picture (ctuPosX + rightOffset < widthInCtus and
//     ctuPosY + 1 < heightInCtus),
//   - `override` is set or pps.canFilterCtuBdry( ctuPosX, ctuPosY, rightOffset, 1 ) is true,
//   - the process state of that CTU is still <= tskType.
// Note that `rightOffset` precedes `override` in the parameter list.
// Callers in this file return "not ready" when this yields true.
inline bool checkCtuTaskNbBotRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, const int rightOffset = 1, bool override = false )
877
8.97M
{
878
8.97M
  return ctuPosX + rightOffset < pps.pcv->widthInCtus && ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, rightOffset, 1 ) ) && processStates[ ctuRsAddr + rightOffset + pps.pcv->widthInCtus ] <= tskType;
879
8.97M
}
880
881
template<bool checkReadyState>
882
bool EncSlice::xProcessCtuTask( int threadIdx, void* taskParam )
883
131M
{
884
131M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
131M
  Picture* pic                   = ctuEncParam->pic;
886
131M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
131M
  CodingStructure& cs            = *pic->cs;
888
131M
  Slice&           slice         = *cs.slice;
889
131M
  const PPS&       pps           = *slice.pps;
890
131M
  const PreCalcValues& pcv       = *cs.pcv;
891
131M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
131M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
131M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
131M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
131M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
131M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
131M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
131M
  const int ctuStride            = pcv.widthInCtus;
899
131M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
131M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
131M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
131M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
131M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
131M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
131M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
131M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
131M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
71.8M
    return false;
916
917
59.2M
  switch( currState )
918
59.2M
  {
919
    // encode
920
28.5M
    case CTU_ENCODE:
921
28.5M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
28.5M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
28.5M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
28.5M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
23.3M
          return false;
937
5.17M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
5.16M
          return false;
939
        
940
8.07k
        if( checkReadyState )
941
4.04k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
4.03k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
4.03k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
4.03k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
4.03k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
4.03k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
4.03k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
4.05k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
2.26k
        {
960
2.26k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
2.26k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
2.26k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
2.26k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
2.26k
        }
965
966
4.03k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
4.03k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
4.03k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
4.03k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
17.5M
    case RESHAPE_LF_VER:
975
17.5M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
17.5M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
17.5M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
17.5M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
17.5M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles
984
17.5M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
26.4M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
17.5M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
8.56M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
8.96M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
8.96M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
8.95M
          return false;
1005
1006
8.02k
        if( checkReadyState )
1007
4.04k
          return true;
1008
1009
3.97k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
3.97k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
3.97k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
4.04k
        {
1023
4.04k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
4.04k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
4.04k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
4.04k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
4.04k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
4.04k
        }
1032
1033
3.97k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
3.97k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
3.97k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
507k
    case LF_HOR:
1041
507k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
507k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
168k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
339k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
331k
          return false;
1051
1052
8.05k
        if( checkReadyState )
1053
4.05k
          return true;
1054
1055
4.00k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
4.00k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
4.05k
        {
1059
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
4.05k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
4.05k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
4.05k
        }
1064
1065
4.00k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
4.00k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
4.00k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
318k
    case SAO_FILTER:
1073
318k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
318k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
259k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing depends on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
234k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
217k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
11.3k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
8.08k
        if( checkReadyState )
1086
4.05k
          return true;
1087
1088
4.03k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
4.05k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
4.05k
        {
1093
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
4.05k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
4.05k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
4.05k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
4.05k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
4.05k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
4.05k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
4.05k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
4.05k
        }
1104
1105
        // ALF border extension
1106
4.03k
        if( cs.sps->alfEnabled )
1107
4.05k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
4.05k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
4.05k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
4.05k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
4.05k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
4.05k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
4.05k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
4.05k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
4.05k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
4.05k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
4.05k
        }
1122
1123
        // DMVR refinement can be stored now
1124
4.05k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
4.05k
        {
1126
4.05k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
4.05k
        }
1128
4.03k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
4.03k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
4.03k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
4.03k
        if( ctuPosX == lastCtuColInTileRow )
1133
2.26k
        {
1134
2.26k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
2.26k
        }
1136
1.77k
        else
1137
1.77k
        {
1138
1.77k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.77k
          return true;
1140
1.77k
        }
1141
4.03k
      }
1142
2.26k
      break;
1143
1144
90.7k
    case ALF_GET_STATISTICS:
1145
90.7k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
90.7k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
90.7k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
4.52k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
4.52k
        if( checkReadyState )
1153
2.26k
          return true;
1154
1155
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
2.26k
        if( slice.sps->alfEnabled )
1159
2.26k
        {
1160
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
2.26k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
4.05k
          {
1165
4.05k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
4.05k
          }
1167
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
2.26k
        }
1169
1170
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
2.26k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
2.26k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
2.26k
      }
1176
0
      break;
1177
1178
1.98M
    case ALF_DERIVE_FILTER:
1179
1.98M
      {
1180
1.98M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.98M
        if( ctuRsAddr == deriveFilterCtu )
1182
1.98M
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.98M
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
2.00M
          for( int y = 0; y < numCheckLines; y++ )
1186
2.00M
          {
1187
2.02M
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
2.00M
            {
1189
2.00M
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
2.00M
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
1.98M
                return false;
1192
2.00M
            }
1193
2.00M
          }
1194
1.98M
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the sub-sequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
2.59k
        if( checkReadyState )
1202
1.29k
          return true;
1203
1204
1.29k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.29k
        if( slice.sps->alfEnabled )
1207
1.29k
        {
1208
1.29k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.29k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.29k
          {
1211
1.29k
            encSlice->m_pALF->initDerivation( slice );
1212
1.29k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.29k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.29k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.29k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.29k
        }
1228
1229
1.29k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.29k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.29k
      }
1232
0
      break;
1233
1234
10.0M
    case ALF_RECONSTRUCT:
1235
10.0M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
10.0M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
10.0M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
10.0M
          return false;
1240
4.52k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
4.52k
        if( checkReadyState )
1249
2.26k
          return true;
1250
1251
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
2.26k
        if( slice.sps->alfEnabled )
1254
2.26k
        {
1255
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
4.05k
          {
1259
4.05k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
4.05k
          }
1261
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
2.26k
        }
1263
1264
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
2.26k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
2.26k
      }
1267
      // don't break, no additional deps, can continue straight away!
1268
      //break;
1269
1270
8.82k
    case CCALF_GET_STATISTICS:
1271
8.82k
      {
1272
8.82k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
4.95k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
3.08k
        if( checkReadyState )
1276
824
          return true;
1277
1278
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
2.26k
        if( slice.sps->ccalfEnabled )
1282
2.26k
        {
1283
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
4.05k
          {
1287
4.05k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
4.05k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
4.05k
          }
1290
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
2.26k
        }
1292
1293
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
2.26k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
2.26k
      }
1298
0
      break;
1299
1300
209k
    case CCALF_DERIVE_FILTER:
1301
209k
      {
1302
        // synchronization dependencies
1303
209k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
209k
        if( ctuRsAddr == deriveFilterCtu )
1305
209k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
209k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
218k
          for( int y = 0; y < numCheckLines; y++ )
1309
216k
          {
1310
225k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
216k
            {
1312
216k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
216k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
207k
                return false;
1315
216k
            }
1316
216k
          }
1317
209k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
2.59k
        if( checkReadyState )
1325
1.29k
          return true;
1326
1327
1.29k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.29k
        if( slice.sps->ccalfEnabled )
1331
1.29k
        {
1332
1.29k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.29k
          {
1334
1.29k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.29k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.29k
        }
1346
1.29k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.29k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.29k
      }
1350
0
      break;
1351
1352
13.5k
    case CCALF_RECONSTRUCT:
1353
13.5k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
13.5k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
13.5k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
9.05k
          return false;
1358
1359
4.53k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
4.53k
        if( checkReadyState )
1368
2.26k
          return true;
1369
1370
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
2.26k
        if( slice.sps->ccalfEnabled )
1373
2.26k
        {
1374
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
4.04k
          {
1377
4.04k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
4.04k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
4.04k
          }
1380
2.26k
        }
1381
1382
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
2.26k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
2.26k
        {
1388
2.26k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
2.26k
          const int margin = cs.picture->margin;
1390
2.26k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
2.26k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
2.26k
          if(ctuPosY == 0)
1393
1.29k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
2.26k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.29k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
2.26k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
2.26k
        }
1401
1402
        // perform finish only once for whole picture
1403
2.26k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
2.26k
        if( ctuRsAddr < finishCtu )
1405
967
        {
1406
967
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
967
          return true;
1409
967
        }
1410
1.29k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.29k
      }
1412
1413
19.8k
    case FINISH_SLICE:
1414
19.8k
      {
1415
19.8k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
43.2k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
41.2k
          if( processStates[ i ] < FINISH_SLICE )
1420
17.9k
            return false;
1421
1422
1.94k
        if( checkReadyState )
1423
647
          return true;
1424
1425
1.29k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.29k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.29k
        return true;
1430
1.94k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
59.2M
  }
1440
1441
21.5k
  return false;
1442
59.2M
}
bool vvenc::EncSlice::xProcessCtuTask<false>(int, void*)
Line
Count
Source
883
27.0k
{
884
27.0k
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
27.0k
  Picture* pic                   = ctuEncParam->pic;
886
27.0k
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
27.0k
  CodingStructure& cs            = *pic->cs;
888
27.0k
  Slice&           slice         = *cs.slice;
889
27.0k
  const PPS&       pps           = *slice.pps;
890
27.0k
  const PreCalcValues& pcv       = *cs.pcv;
891
27.0k
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
27.0k
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
27.0k
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
27.0k
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
27.0k
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
27.0k
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
27.0k
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
27.0k
  const int ctuStride            = pcv.widthInCtus;
899
27.0k
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
27.0k
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
27.0k
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
27.0k
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
27.0k
  const TaskType currState       = processStates[ ctuRsAddr ];
904
27.0k
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
27.0k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
27.0k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
27.0k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
27.0k
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
27.0k
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
27.0k
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
0
    return false;
916
917
27.0k
  switch( currState )
918
27.0k
  {
919
    // encode
920
4.05k
    case CTU_ENCODE:
921
4.05k
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
4.05k
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
4.05k
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
4.05k
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
0
          return false;
937
4.05k
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
0
          return false;
939
        
940
4.05k
        if( checkReadyState )
941
0
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
4.05k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
4.05k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
4.05k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
4.05k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
4.05k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
4.05k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
4.05k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
2.26k
        {
960
2.26k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
2.26k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
2.26k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
2.26k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
2.26k
        }
965
966
4.05k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
4.05k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
4.05k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
4.05k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
4.05k
    case RESHAPE_LF_VER:
975
4.05k
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
4.05k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
4.05k
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
4.05k
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
4.05k
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above because of tiling, which allows CTU_ENCODE to run independently across tiles
984
4.05k
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
8.10k
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
4.05k
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
0
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
4.05k
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
4.05k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
0
          return false;
1005
1006
4.05k
        if( checkReadyState )
1007
0
          return true;
1008
1009
4.05k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
4.05k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
4.05k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
4.04k
        {
1023
4.04k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
4.04k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
4.04k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
4.04k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
4.04k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
4.04k
        }
1032
1033
4.05k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
4.05k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
4.05k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
4.05k
    case LF_HOR:
1041
4.05k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
4.05k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
0
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
4.05k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
0
          return false;
1051
1052
4.05k
        if( checkReadyState )
1053
0
          return true;
1054
1055
4.05k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
4.05k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
4.05k
        {
1059
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
4.05k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
4.05k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
4.05k
        }
1064
1065
4.05k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
4.05k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
4.05k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
4.05k
    case SAO_FILTER:
1073
4.05k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
4.05k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
4.05k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing depends on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
4.05k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
4.05k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
4.05k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
4.05k
        if( checkReadyState )
1086
0
          return true;
1087
1088
4.05k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
4.05k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
4.05k
        {
1093
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
4.05k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
4.05k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
4.05k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
4.05k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
4.05k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
4.05k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
4.05k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
4.05k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
4.05k
        }
1104
1105
        // ALF border extension
1106
4.05k
        if( cs.sps->alfEnabled )
1107
4.05k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
4.05k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
4.05k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
4.05k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
4.05k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
4.05k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
4.05k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
4.05k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
4.05k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
4.05k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
4.05k
        }
1122
1123
        // DMVR refinement can be stored now
1124
4.05k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
4.05k
        {
1126
4.05k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
4.05k
        }
1128
4.05k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
4.05k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
4.05k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
4.05k
        if( ctuPosX == lastCtuColInTileRow )
1133
2.26k
        {
1134
2.26k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
2.26k
        }
1136
1.78k
        else
1137
1.78k
        {
1138
1.78k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.78k
          return true;
1140
1.78k
        }
1141
4.05k
      }
1142
2.26k
      break;
1143
1144
2.26k
    case ALF_GET_STATISTICS:
1145
2.26k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
2.26k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
2.26k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
2.26k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
2.26k
        if( checkReadyState )
1153
0
          return true;
1154
1155
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
2.26k
        if( slice.sps->alfEnabled )
1159
2.26k
        {
1160
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
2.26k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
4.05k
          {
1165
4.05k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
4.05k
          }
1167
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
2.26k
        }
1169
1170
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
2.26k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
2.26k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
2.26k
      }
1176
0
      break;
1177
1178
1.29k
    case ALF_DERIVE_FILTER:
1179
1.29k
      {
1180
1.29k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.29k
        if( ctuRsAddr == deriveFilterCtu )
1182
1.29k
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.29k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
3.56k
          for( int y = 0; y < numCheckLines; y++ )
1186
2.26k
          {
1187
4.53k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
2.26k
            {
1189
2.26k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
2.26k
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
0
                return false;
1192
2.26k
            }
1193
2.26k
          }
1194
1.29k
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the subsequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.29k
        if( checkReadyState )
1202
0
          return true;
1203
1204
1.29k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.29k
        if( slice.sps->alfEnabled )
1207
1.29k
        {
1208
1.29k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.29k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.29k
          {
1211
1.29k
            encSlice->m_pALF->initDerivation( slice );
1212
1.29k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.29k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.29k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.29k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.29k
        }
1228
1229
1.29k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.29k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.29k
      }
1232
0
      break;
1233
1234
2.26k
    case ALF_RECONSTRUCT:
1235
2.26k
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
2.26k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
2.26k
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
0
          return false;
1240
2.26k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
2.26k
        if( checkReadyState )
1249
0
          return true;
1250
1251
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
2.26k
        if( slice.sps->alfEnabled )
1254
2.26k
        {
1255
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
4.05k
          {
1259
4.05k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
4.05k
          }
1261
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
2.26k
        }
1263
1264
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
2.26k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
2.26k
      }
1267
      // don't break, no additional deps, can continue straight away!
1268
      //break;
1269
1270
3.08k
    case CCALF_GET_STATISTICS:
1271
3.08k
      {
1272
3.08k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
2.50k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
2.26k
        if( checkReadyState )
1276
0
          return true;
1277
1278
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
2.26k
        if( slice.sps->ccalfEnabled )
1282
2.26k
        {
1283
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
4.05k
          {
1287
4.05k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
4.05k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
4.05k
          }
1290
2.26k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
2.26k
        }
1292
1293
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
2.26k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
2.26k
      }
1298
0
      break;
1299
1300
1.29k
    case CCALF_DERIVE_FILTER:
1301
1.29k
      {
1302
        // synchronization dependencies
1303
1.29k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
1.29k
        if( ctuRsAddr == deriveFilterCtu )
1305
1.29k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
1.29k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
3.56k
          for( int y = 0; y < numCheckLines; y++ )
1309
2.26k
          {
1310
4.53k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
2.26k
            {
1312
2.26k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
2.26k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
0
                return false;
1315
2.26k
            }
1316
2.26k
          }
1317
1.29k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.29k
        if( checkReadyState )
1325
0
          return true;
1326
1327
1.29k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.29k
        if( slice.sps->ccalfEnabled )
1331
1.29k
        {
1332
1.29k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.29k
          {
1334
1.29k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.29k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.29k
        }
1346
1.29k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.29k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.29k
      }
1350
0
      break;
1351
1352
2.26k
    case CCALF_RECONSTRUCT:
1353
2.26k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
2.26k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
2.26k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
0
          return false;
1358
1359
2.26k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
2.26k
        if( checkReadyState )
1368
0
          return true;
1369
1370
2.26k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
2.26k
        if( slice.sps->ccalfEnabled )
1373
2.26k
        {
1374
2.26k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
6.31k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
4.04k
          {
1377
4.04k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
4.04k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
4.04k
          }
1380
2.26k
        }
1381
1382
2.26k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
2.26k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
2.26k
        {
1388
2.26k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
2.26k
          const int margin = cs.picture->margin;
1390
2.26k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
2.26k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
2.26k
          if(ctuPosY == 0)
1393
1.29k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
2.26k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.29k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
2.26k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
2.26k
        }
1401
1402
        // perform finish only once for whole picture
1403
2.26k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
2.26k
        if( ctuRsAddr < finishCtu )
1405
967
        {
1406
967
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
967
          return true;
1409
967
        }
1410
1.29k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.29k
      }
1412
1413
1.94k
    case FINISH_SLICE:
1414
1.94k
      {
1415
1.94k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
5.25k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
3.95k
          if( processStates[ i ] < FINISH_SLICE )
1420
647
            return false;
1421
1422
1.29k
        if( checkReadyState )
1423
0
          return true;
1424
1425
1.29k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.29k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.29k
        return true;
1430
1.29k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
27.0k
  }
1440
1441
21.5k
  return false;
1442
27.0k
}
bool vvenc::EncSlice::xProcessCtuTask<true>(int, void*)
Line
Count
Source
883
131M
{
884
131M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
131M
  Picture* pic                   = ctuEncParam->pic;
886
131M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
131M
  CodingStructure& cs            = *pic->cs;
888
131M
  Slice&           slice         = *cs.slice;
889
131M
  const PPS&       pps           = *slice.pps;
890
131M
  const PreCalcValues& pcv       = *cs.pcv;
891
131M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
131M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
131M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
131M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
131M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
131M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
131M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
131M
  const int ctuStride            = pcv.widthInCtus;
899
131M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
131M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
131M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
131M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
131M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
131M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
131M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
131M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
131M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
131M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
71.8M
    return false;
916
917
59.2M
  switch( currState )
918
59.2M
  {
919
    // encode
920
28.5M
    case CTU_ENCODE:
921
28.5M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
28.5M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
28.5M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
28.5M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
23.3M
          return false;
937
5.17M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
5.16M
          return false;
939
        
940
4.02k
        if( checkReadyState )
941
4.04k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
18.4E
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
18.4E
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
18.4E
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
18.4E
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
18.4E
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
18.4E
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
0
        {
960
0
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
0
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
0
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
0
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
0
        }
965
966
18.4E
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
18.4E
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
18.4E
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
17.5M
    case RESHAPE_LF_VER:
975
17.5M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
17.5M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
17.5M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
17.5M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
17.5M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above because of tiling, which allows CTU_ENCODE to run independently across tiles
984
17.5M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
26.4M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
17.5M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
8.56M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
8.95M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
8.95M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
8.95M
          return false;
1005
1006
3.97k
        if( checkReadyState )
1007
4.04k
          return true;
1008
1009
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
18.4E
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
0
        {
1023
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
0
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
0
        }
1032
1033
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
18.4E
        processStates[ ctuRsAddr ] = LF_HOR;
1036
18.4E
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
503k
    case LF_HOR:
1041
503k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
503k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
168k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
335k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
331k
          return false;
1051
1052
4.00k
        if( checkReadyState )
1053
4.05k
          return true;
1054
1055
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
0
        {
1059
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
0
        }
1064
1065
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
18.4E
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
18.4E
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
314k
    case SAO_FILTER:
1073
314k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
314k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
255k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // SAO processing depends on +1 pixel on each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
230k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
213k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
7.27k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
4.03k
        if( checkReadyState )
1086
4.05k
          return true;
1087
1088
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
18.4E
        if( slice.sps->saoEnabled && pic->useSAO )
1092
0
        {
1093
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
0
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
0
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
0
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
0
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
0
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
0
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
0
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
0
        }
1104
1105
        // ALF border extension
1106
18.4E
        if( cs.sps->alfEnabled )
1107
0
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
0
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
0
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
0
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
0
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
0
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
0
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
0
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
0
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
0
        }
1122
1123
        // DMVR refinement can be stored now
1124
18.4E
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
0
        {
1126
0
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
0
        }
1128
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
18.4E
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
18.4E
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
18.4E
        if( ctuPosX == lastCtuColInTileRow )
1133
0
        {
1134
0
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
0
        }
1136
18.4E
        else
1137
18.4E
        {
1138
18.4E
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
18.4E
          return true;
1140
18.4E
        }
1141
18.4E
      }
1142
0
      break;
1143
1144
88.4k
    case ALF_GET_STATISTICS:
1145
88.4k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
88.4k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
88.4k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
2.26k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
2.26k
        if( checkReadyState )
1153
2.26k
          return true;
1154
1155
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
0
        if( slice.sps->alfEnabled )
1159
0
        {
1160
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
0
          {
1165
0
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
0
          }
1167
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
0
        }
1169
1170
0
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
0
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
0
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
0
      }
1176
0
      break;
1177
1178
1.98M
    case ALF_DERIVE_FILTER:
1179
1.98M
      {
1180
1.98M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.98M
        if( ctuRsAddr == deriveFilterCtu )
1182
1.98M
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.98M
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
2.00M
          for( int y = 0; y < numCheckLines; y++ )
1186
2.00M
          {
1187
2.01M
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
2.00M
            {
1189
2.00M
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
2.00M
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
1.98M
                return false;
1192
2.00M
            }
1193
2.00M
          }
1194
1.98M
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the subsequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.29k
        if( checkReadyState )
1202
1.29k
          return true;
1203
1204
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
0
        if( slice.sps->alfEnabled )
1207
0
        {
1208
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
0
          if( ctuRsAddr == deriveFilterCtu )
1210
0
          {
1211
0
            encSlice->m_pALF->initDerivation( slice );
1212
0
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
0
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
0
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
0
        }
1228
1229
0
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
0
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
0
      }
1232
0
      break;
1233
1234
10.0M
    case ALF_RECONSTRUCT:
1235
10.0M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
10.0M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
10.0M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
10.0M
          return false;
1240
2.26k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
2.26k
        if( checkReadyState )
1249
2.26k
          return true;
1250
1251
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
18.4E
        if( slice.sps->alfEnabled )
1254
0
        {
1255
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
0
          {
1259
0
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
0
          }
1261
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
0
        }
1263
1264
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
18.4E
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
18.4E
      }
1267
      // don't break, no additional deps, can continue straight away!
1268
      //break;
1269
1270
5.73k
    case CCALF_GET_STATISTICS:
1271
5.73k
      {
1272
5.73k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
2.45k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
823
        if( checkReadyState )
1276
824
          return true;
1277
1278
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
18.4E
        if( slice.sps->ccalfEnabled )
1282
0
        {
1283
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
0
          {
1287
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
0
          }
1290
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
0
        }
1292
1293
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
18.4E
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
18.4E
      }
1298
0
      break;
1299
1300
208k
    case CCALF_DERIVE_FILTER:
1301
208k
      {
1302
        // synchronization dependencies
1303
208k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
208k
        if( ctuRsAddr == deriveFilterCtu )
1305
208k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
208k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
215k
          for( int y = 0; y < numCheckLines; y++ )
1309
214k
          {
1310
220k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
214k
            {
1312
214k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
214k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
207k
                return false;
1315
214k
            }
1316
214k
          }
1317
208k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.29k
        if( checkReadyState )
1325
1.29k
          return true;
1326
1327
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
0
        if( slice.sps->ccalfEnabled )
1331
0
        {
1332
0
          if( ctuRsAddr == deriveFilterCtu )
1333
0
          {
1334
0
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
0
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
0
        }
1346
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
0
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
0
      }
1350
0
      break;
1351
1352
11.3k
    case CCALF_RECONSTRUCT:
1353
11.3k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
11.3k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
11.3k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
9.05k
          return false;
1358
1359
2.26k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
2.26k
        if( checkReadyState )
1368
2.26k
          return true;
1369
1370
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
0
        if( slice.sps->ccalfEnabled )
1373
0
        {
1374
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
0
          {
1377
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
0
          }
1380
0
        }
1381
1382
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
0
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
0
        {
1388
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
0
          const int margin = cs.picture->margin;
1390
0
          recoBuf.extendBorderPelLft( y, height, margin );
1391
0
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
0
          if(ctuPosY == 0)
1393
0
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
0
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
0
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
0
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
0
        }
1401
1402
        // perform finish only once for whole picture
1403
0
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
0
        if( ctuRsAddr < finishCtu )
1405
0
        {
1406
0
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
0
          return true;
1409
0
        }
1410
0
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
0
      }
1412
1413
17.9k
    case FINISH_SLICE:
1414
17.9k
      {
1415
17.9k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
37.9k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
37.3k
          if( processStates[ i ] < FINISH_SLICE )
1420
17.2k
            return false;
1421
1422
647
        if( checkReadyState )
1423
647
          return true;
1424
1425
0
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
0
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
0
        return true;
1430
647
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
59.2M
  }
1440
1441
0
  return false;
1442
59.2M
}
1443
1444
/**
 * Entropy-code the slice data of one picture.
 *
 * Visits the CTUs from the slice's first CTU address up to the number of CTUs
 * in the picture and writes each one with m_CABACWriter into a per-substream
 * OutputBitstream. A substream is terminated (end_of_slice + byte alignment)
 * after the last CTU of a tile, at the end of a CTU row inside a tile when
 * entropy coding sync (wavefronts) is enabled, and after the last CTU. For all
 * but the last substream the size is recorded in the slice. Finally the
 * substreams are appended to pic->sliceDataStreams[0] and the bin count of the
 * CABAC writer is added to pic->sliceDataNumBins.
 *
 * Also updates slice->encCABACTableIdx and the member m_encCABACTableIdx.
 *
 * \param pic picture to encode; its coding structure (pic->cs) provides the slice and the CTU data
 */
void EncSlice::encodeSliceData( Picture* pic )
1445
1.29k
{
1446
1.29k
  CodingStructure& cs              = *pic->cs;
1447
1.29k
  Slice* const slice               = cs.slice;
1448
1.29k
  const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
1449
1.29k
  const uint32_t boundingCtuTsAddr = cs.pcv->sizeInCtus;
1450
1.29k
  const bool wavefrontsEnabled     = slice->sps->entropyCodingSyncEnabled;
1451
1452
  // this ensures that independently encoded bitstream chunks can be combined to bit-equal
1453
1.29k
  const SliceType cabacTableIdx = ! slice->pps->cabacInitPresent || slice->pendingRasInit ? slice->sliceType : m_encCABACTableIdx;
1454
1.29k
  slice->encCABACTableIdx = cabacTableIdx;
1455
1456
  // initialise entropy coder for the slice
1457
1.29k
  m_CABACWriter.initCtxModels( *slice );
1458
1459
1.29k
  DTRACE( g_trace_ctx, D_HEADER, "=========== POC: %d ===========\n", slice->poc );
1460
1461
1.29k
  // QP predictors handed to coding_tree_unit(); both used entries start at the slice QP
  int prevQP[MAX_NUM_CH];
1462
1.29k
  prevQP[0] = prevQP[1] = slice->sliceQp;
1463
1464
1.29k
  const PreCalcValues& pcv        = *cs.pcv;
1465
1.29k
  const uint32_t widthInCtus      = pcv.widthInCtus;
1466
1.29k
  uint32_t uiSubStrm              = 0;
1467
1.29k
  // number of substreams: tile columns times either CTU rows (wavefronts) or tile rows
  const int numSubstreamsColumns  = slice->pps->numTileCols;
1468
1.29k
  const int numSubstreamRows      = slice->sps->entropyCodingSyncEnabled ? pic->cs->pcv->heightInCtus : slice->pps->numTileRows;
1469
1.29k
  const int numSubstreams         = std::max<int>( numSubstreamRows * numSubstreamsColumns, 0/*(int)pic->brickMap->bricks.size()*/ );
1470
1.29k
  std::vector<OutputBitstream> substreamsOut( numSubstreams );
1471
1472
1.29k
  slice->clearSubstreamSizes();
1473
1474
5.34k
  for( uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++ )
1475
4.05k
  {
1476
4.05k
    const uint32_t ctuRsAddr            = slice->sliceMap.ctuAddrInSlice[ctuTsAddr];
1477
4.05k
    const uint32_t ctuXPosInCtus        = ctuRsAddr % widthInCtus;
1478
4.05k
    const uint32_t ctuYPosInCtus        = ctuRsAddr / widthInCtus;
1479
4.05k
    // position (in CTUs) of the first column / first row of the tile containing this CTU
    const uint32_t tileXPosInCtus       = slice->pps->tileColBd[cs.pps->ctuToTileCol[ctuXPosInCtus]];
1480
4.05k
    const uint32_t tileYPosInCtus       = slice->pps->tileRowBd[cs.pps->ctuToTileRow[ctuYPosInCtus]];
1481
1482
4.05k
    DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
1483
1484
4.05k
    const Position pos (ctuXPosInCtus * pcv.maxCUSize, ctuYPosInCtus * pcv.maxCUSize);
1485
4.05k
    const UnitArea ctuArea (cs.area.chromaFormat, Area(pos.x, pos.y, pcv.maxCUSize, pcv.maxCUSize));
1486
4.05k
    CHECK( uiSubStrm >= numSubstreams, "array index out of bounds" );
1487
4.05k
    m_CABACWriter.initBitstream( &substreamsOut[ uiSubStrm ] );
1488
1489
    // set up CABAC contexts' state for this CTU
1490
4.05k
    if (ctuXPosInCtus == tileXPosInCtus && ctuYPosInCtus == tileYPosInCtus )
1491
1.29k
    {
1492
1.29k
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1493
0
      {
1494
0
        m_CABACWriter.initCtxModels( *slice );
1495
0
      }
1496
1.29k
      prevQP[0] = prevQP[1] = slice->sliceQp;
1497
1.29k
    }
1498
2.75k
    else if (ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled)
1499
0
    {
1500
      // First CTU of a CTU row inside a tile (wavefronts enabled): reset the contexts, then take over the stored sync state if the position above is available.
1501
0
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1502
0
      {
1503
0
        m_CABACWriter.initCtxModels( *slice );
1504
0
      }
1505
0
      if( cs.getCURestricted( pos.offset( 0, -1 ), pos, slice->independentSliceIdx, slice->pps->getTileIdx( ctuXPosInCtus, ctuYPosInCtus ), CH_L, TREE_D ) )
1506
0
      {
1507
        // The CU directly above (pos.offset( 0, -1 )) is available, so use the stored context state.
1508
0
        m_CABACWriter.getCtx() = m_entropyCodingSyncContextState;
1509
0
      }
1510
0
      prevQP[0] = prevQP[1] = slice->sliceQp;
1511
0
    }
1512
1513
4.05k
    m_CABACWriter.coding_tree_unit( cs, ctuArea, prevQP, ctuRsAddr );
1514
1515
    // store the context state after coding the first CTU of a CTU row inside a tile (read back at the start of a later row, see above)
1516
4.05k
    if( ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled )
1517
0
    {
1518
0
      m_entropyCodingSyncContextState = m_CABACWriter.getCtx();
1519
0
    }
1520
1521
    // terminate the sub-stream, if required (end of slice-segment, end of tile, end of wavefront-CTU-row):
1522
4.05k
    bool isMoreCTUsinSlice = ctuTsAddr != (boundingCtuTsAddr - 1);
1523
4.05k
    bool isLastCTUinTile   = isMoreCTUsinSlice && slice->pps->getTileIdx( ctuRsAddr ) != slice->pps->getTileIdx( slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] );
1524
4.05k
    bool isLastCTUinWPP    = wavefrontsEnabled && isMoreCTUsinSlice && !isLastCTUinTile && ( (slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus) == cs.pps->tileColBd[cs.pps->ctuToTileCol[slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus]] ); //TODO: adjust tile bound condition
1525
1526
4.05k
    if (isLastCTUinWPP || !isMoreCTUsinSlice || isLastCTUinTile )         // this is the last CTU of either tile/brick/WPP/slice
1527
1.29k
    {
1528
1.29k
      m_CABACWriter.end_of_slice();
1529
1530
      // Byte-alignment in slice_data() when new tile
1531
1.29k
      substreamsOut[ uiSubStrm ].writeByteAlignment();
1532
1533
1.29k
      if (isMoreCTUsinSlice) // record the substream size only when it is not the last substream in the slice
1534
0
      {
1535
        // write sub-stream size
1536
0
        slice->addSubstreamSize( ( substreamsOut[ uiSubStrm ].getNumberOfWrittenBits() >> 3 ) + substreamsOut[ uiSubStrm ].countStartCodeEmulations() );
1537
0
      }
1538
1.29k
      uiSubStrm++;
1539
1.29k
    }
1540
4.05k
  } // CTU-loop
1541
1542
1.29k
  // update the member that is read when choosing cabacTableIdx at the top of this function
  if(slice->pps->cabacInitPresent)
1543
0
  {
1544
0
    m_encCABACTableIdx = m_CABACWriter.getCtxInitId( *slice );
1545
0
  }
1546
1.29k
  else
1547
1.29k
  {
1548
1.29k
    m_encCABACTableIdx = slice->sliceType;
1549
1.29k
  }
1550
1551
  // concatenate substreams
1552
1.29k
  OutputBitstream& outStream = pic->sliceDataStreams[ 0/*slice->sliceIdx*/ ];
1553
2.59k
  // "+ 1": a size was recorded above for every terminated substream except the last one (assuming clearSubstreamSizes() emptied the list)
  for ( int i = 0; i < slice->getNumberOfSubstreamSizes() + 1; i++ )
1554
1.29k
  {
1555
1.29k
    outStream.addSubstream( &(substreamsOut[ i ]) );
1556
1.29k
  }
1557
1.29k
  pic->sliceDataNumBins += m_CABACWriter.getNumBins();
1558
1.29k
}
1559
1560
} // namespace vvenc
1561
1562
//! \}
1563