Coverage Report

Created: 2026-06-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/vvenc/source/Lib/EncoderLib/EncSlice.cpp
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
44
/** \file     EncSlice.cpp
45
    \brief    slice encoder class
46
*/
47
48
#include "EncSlice.h"
49
#include "EncStage.h"
50
#include "EncLib.h"
51
#include "EncPicture.h"
52
#include "BitAllocation.h"
53
#include "CommonLib/UnitTools.h"
54
#include "CommonLib/Picture.h"
55
#include "CommonLib/TimeProfiler.h"
56
#include "CommonLib/dtrace_codingstruct.h"
57
#include "Utilities/NoMallocThreadPool.h"
58
59
#include <math.h>
60
#include "vvenc/vvencCfg.h"
61
62
//! \ingroup EncoderLib
63
//! \{
64
65
namespace vvenc {
66
67
// Tracing handles, compiled in only when TRACE_ENABLE_ITT is defined:
// one domain object ("Encode") and one string handle per task label listed below.
// All of them are file-local (static) and created during static initialization.
#ifdef TRACE_ENABLE_ITT
68
static const __itt_domain* itt_domain_encode              = __itt_domain_create( "Encode" );
69
static const __itt_string_handle* itt_handle_ctuEncode    = __itt_string_handle_create( "Encode_CTU" );
70
static const __itt_string_handle* itt_handle_rspLfVer     = __itt_string_handle_create( "RspLfVer_CTU" );
71
static const __itt_string_handle* itt_handle_lfHor        = __itt_string_handle_create( "LfHor_CTU" );
72
static const __itt_string_handle* itt_handle_sao          = __itt_string_handle_create( "SAO_CTU" );
73
static const __itt_string_handle* itt_handle_alf_stat     = __itt_string_handle_create( "ALF_CTU_STAT" );
74
static const __itt_string_handle* itt_handle_alf_derive   = __itt_string_handle_create( "ALF_DERIVE" );
75
static const __itt_string_handle* itt_handle_alf_recon    = __itt_string_handle_create( "ALF_RECONSTRUCT" );
76
static const __itt_string_handle* itt_handle_ccalf_stat   = __itt_string_handle_create( "CCALF_CTU_STAT" );
77
static const __itt_string_handle* itt_handle_ccalf_derive = __itt_string_handle_create( "CCALF_DERIVE" );
78
static const __itt_string_handle* itt_handle_ccalf_recon  = __itt_string_handle_create( "CCALF_RECONSTRUCT" );
79
#endif
80
81
void setArbitraryWppPattern( const PreCalcValues& pcv, std::vector<int>& ctuAddrMap, int stepX = 1 )
82
4.34k
{
83
4.34k
  ctuAddrMap.resize( pcv.sizeInCtus, 0 );
84
4.34k
  std::vector<int> x_in_line( pcv.heightInCtus, 0 );
85
4.34k
  int x = 0, y = 0, addr = 0;
86
4.34k
  int y_top = 0;
87
4.34k
  const int step = stepX; // number of CTUs in x-direction to scan 
88
4.34k
  ctuAddrMap[addr++] = x++; // first entry (can be omitted)
89
10.1k
  while( addr < pcv.sizeInCtus )
90
6.55k
  {
91
    // fill entries in x-direction
92
6.55k
    int x1 = x;
93
15.5k
    while( x < std::min(x1 + step, (int)pcv.widthInCtus) )
94
8.98k
    {
95
      // general WPP condition (top-right CTU availability)
96
8.98k
      if( y > 0 && !( x_in_line[y - 1] - x >= 2 ) && x != pcv.widthInCtus - 1 )
97
0
        break;
98
8.98k
      ctuAddrMap[addr++] = y*pcv.widthInCtus + x;
99
8.98k
      x++;
100
8.98k
    }
101
6.55k
    x_in_line[y] = x;
102
        
103
6.55k
    y += 1;
104
105
6.55k
    if( y >= pcv.heightInCtus )
106
3.29k
    {
107
      // go up
108
3.29k
      if( x_in_line[y_top] >= pcv.widthInCtus )
109
3.29k
      {
110
3.29k
        y_top++;
111
3.29k
        if( y_top >= pcv.heightInCtus )
112
756
        {
113
          // done
114
756
          break;
115
756
        }
116
3.29k
      }
117
2.53k
      y = y_top;
118
2.53k
    }
119
5.80k
    x = x_in_line[y];
120
121
5.80k
    CHECK( y >= pcv.heightInCtus, "Height in CTUs is exceeded" );
122
5.80k
  }
123
4.34k
}
124
125
struct TileLineEncRsrc
126
{
127
  BitEstimator            m_BitEstimator;
128
  CABACWriter             m_CABACEstimator;
129
  BitEstimator            m_SaoBitEstimator;
130
  CABACWriter             m_SaoCABACEstimator;
131
  BitEstimator            m_AlfBitEstimator;
132
  CABACWriter             m_AlfCABACEstimator;
133
  ReuseUniMv              m_ReuseUniMv;
134
  BlkUniMvInfoBuffer      m_BlkUniMvInfoBuffer;
135
  AffineProfList          m_AffineProfList;
136
  IbcBvCand               m_CachedBvs;
137
  EncSampleAdaptiveOffset m_encSao;
138
  int                     m_prevQp[ MAX_NUM_CH ];
139
7.60k
  TileLineEncRsrc( const VVEncCfg& encCfg ) : m_CABACEstimator( m_BitEstimator ), m_SaoCABACEstimator( m_SaoBitEstimator ), m_AlfCABACEstimator( m_AlfBitEstimator ) { m_AffineProfList.init( ! encCfg.m_picReordering ); }
140
};
141
142
// Resources that EncSlice allocates once per entry of m_ThreadRsrc (see EncSlice::init)
// and releases again in EncSlice::~EncSlice.
struct PerThreadRsrc
143
{
144
  // NOTE(review): presumably a pool of CABAC context storage for this thread; not used in the visible code - confirm
  CtxCache  m_CtxCache;
145
  // CU encoder; initialized in EncSlice::init and re-initialized per picture / per slice
  EncCu     m_encCu;
146
  // buffer of (maxCUSize + 2 * MAX_ALF_PADDING_SIZE) squared, created in EncSlice::init, destroyed in the destructor
  PelStorage m_alfTempCtuBuf;
147
};
148
149
struct CtuEncParam
150
{
151
  Picture*  pic;
152
  EncSlice* encSlice;
153
  int       ctuRsAddr;
154
  int       ctuPosX;
155
  int       ctuPosY;
156
  UnitArea  ctuArea;
157
  int       tileLineResIdx;
158
159
13.3k
  CtuEncParam() : pic( nullptr ), encSlice( nullptr ), ctuRsAddr( 0 ), ctuPosX( 0 ), ctuPosY( 0 ), ctuArea(), tileLineResIdx( 0 ) {}
160
  CtuEncParam( Picture* _p, EncSlice* _s, const int _r, const int _x, const int _y, const int _tileLineResIdx )
161
    : pic( _p )
162
    , encSlice( _s )
163
    , ctuRsAddr( _r )
164
    , ctuPosX( _x )
165
    , ctuPosY( _y )
166
    , ctuArea( pic->chromaFormat, pic->slices[0]->pps->pcv->getCtuArea( _x, _y ) )
167
0
    , tileLineResIdx( _tileLineResIdx ) {}
168
};
169
170
// ====================================================================================================================
171
// Constructor / destructor / create / destroy
172
// ====================================================================================================================
173
174
/// Puts the slice encoder into its unconfigured state: all collaborator pointers are
/// null, the CTU encode delay is 1, the CABAC writer is bound to the member bin
/// encoder and the CABAC table index starts at VVENC_I_SLICE. init() must be called
/// before use.
EncSlice::EncSlice()
  : m_pcEncCfg( nullptr ),
    m_threadPool( nullptr ),
    m_ctuTasksDoneCounter( nullptr ),
    m_ctuEncDelay( 1 ),
    m_pLoopFilter( nullptr ),
    m_pALF( nullptr ),
    m_pcRateCtrl( nullptr ),
    m_CABACWriter( m_BinEncoder ),
    m_encCABACTableIdx( VVENC_I_SLICE )
{}
186
187
188
/// Releases everything allocated by init(): the per-tile-line resources, the
/// per-thread resources (including their ALF temp buffers) and the SAO statistics.
EncSlice::~EncSlice()
{
  // per-tile-line resources
  for( TileLineEncRsrc* lineRsrc : m_TileLineEncRsrc )
  {
    delete lineRsrc;
  }
  m_TileLineEncRsrc.clear();

  // per-thread resources; the ALF buffer is destroyed explicitly before the delete
  for( PerThreadRsrc* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_alfTempCtuBuf.destroy();
    delete threadRsrc;
  }
  m_ThreadRsrc.clear();

  m_saoReconParams.clear();

  // SAO statistics: one array per component inside one array per CTU
  for( auto& ctuStats : m_saoStatData )
  {
    for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ )
    {
      delete[] ctuStats[ compIdx ];
    }
    delete[] ctuStats;
  }
  m_saoStatData.clear();
}
215
216
void EncSlice::init( const VVEncCfg& encCfg,
217
                     const SPS& sps,
218
                     const PPS& pps,
219
                     std::vector<int>* const globalCtuQpVector,
220
                     LoopFilter& loopFilter,
221
                     EncAdaptiveLoopFilter& alf,
222
                     RateCtrl& rateCtrl,
223
                     NoMallocThreadPool* threadPool,
224
                     WaitCounter* ctuTasksDoneCounter )
225
4.34k
{
226
4.34k
  m_pcEncCfg            = &encCfg;
227
4.34k
  m_pLoopFilter         = &loopFilter;
228
4.34k
  m_pALF                = &alf;
229
4.34k
  m_pcRateCtrl          = &rateCtrl;
230
4.34k
  m_threadPool          = threadPool;
231
4.34k
  m_ctuTasksDoneCounter = ctuTasksDoneCounter;
232
4.34k
  m_syncPicCtx.resize( encCfg.m_entropyCodingSyncEnabled ? pps.getNumTileLineIds() : 0 );
233
234
  
235
4.34k
  const int maxCntRscr = ( encCfg.m_numThreads > 0 ) ? pps.getNumTileLineIds() : 1;
236
4.34k
  const int maxCtuEnc  = ( encCfg.m_numThreads > 0 && threadPool ) ? threadPool->numThreads() : 1;
237
238
4.34k
  m_ThreadRsrc.resize( maxCtuEnc,  nullptr );
239
4.34k
  m_TileLineEncRsrc.resize( maxCntRscr, nullptr );
240
241
4.34k
  for( PerThreadRsrc*& taskRsc : m_ThreadRsrc )
242
17.3k
  {
243
17.3k
    taskRsc = new PerThreadRsrc();
244
17.3k
    taskRsc->m_encCu.init( encCfg,
245
17.3k
                           sps,
246
17.3k
                           globalCtuQpVector,
247
17.3k
                           m_syncPicCtx.data(),
248
17.3k
                           &rateCtrl );
249
17.3k
    taskRsc->m_alfTempCtuBuf.create( pps.pcv->chrFormat, Area( 0, 0, pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1), pps.pcv->maxCUSize + (MAX_ALF_PADDING_SIZE << 1) ), pps.pcv->maxCUSize, MAX_ALF_PADDING_SIZE, 0, false );
250
17.3k
  }
251
252
4.34k
  for( TileLineEncRsrc*& lnRsc : m_TileLineEncRsrc )
253
7.60k
  {
254
7.60k
    lnRsc = new TileLineEncRsrc( encCfg );
255
7.60k
    if( sps.saoEnabled )
256
7.60k
    {
257
7.60k
      lnRsc->m_encSao.init( encCfg );
258
7.60k
    }
259
7.60k
  }
260
261
4.34k
  const int sizeInCtus = pps.pcv->sizeInCtus;
262
4.34k
  m_processStates = std::vector<ProcessCtuState>( sizeInCtus );
263
4.34k
  m_saoReconParams.resize( sizeInCtus );
264
265
4.34k
  ::memset( m_saoDisabledRate, 0, sizeof( m_saoDisabledRate ) );
266
267
  // sao statistics
268
4.34k
  if( encCfg.m_bUseSAO )
269
4.34k
  {
270
4.34k
    m_saoStatData.resize( sizeInCtus );
271
17.6k
    for( int i = 0; i < sizeInCtus; i++ )
272
13.3k
    {
273
13.3k
      m_saoStatData[ i ] = new SAOStatData*[ MAX_NUM_COMP ];
274
53.3k
      for( int compIdx = 0; compIdx < MAX_NUM_COMP; compIdx++ )
275
39.9k
      {
276
39.9k
        m_saoStatData[ i ][ compIdx ] = new SAOStatData[ NUM_SAO_NEW_TYPES ];
277
39.9k
      }
278
13.3k
    }
279
4.34k
  }
280
4.34k
  ctuEncParams.resize( sizeInCtus );
281
4.34k
  setArbitraryWppPattern( *pps.pcv, m_ctuAddrMap, 3 );
282
283
4.34k
  const unsigned asuHeightInCtus = m_pALF->getAsuHeightInCtus();
284
4.34k
  const unsigned numDeriveLines  = encCfg.m_ifpLines ? 
285
4.34k
    std::min( ((encCfg.m_ifpLines & (~(asuHeightInCtus - 1))) + asuHeightInCtus), pps.pcv->heightInCtus ) : pps.pcv->heightInCtus;
286
4.34k
  m_alfDeriveCtu  = numDeriveLines * pps.pcv->widthInCtus - 1;
287
4.34k
  m_ccalfDeriveCtu = encCfg.m_ifpLines ? pps.pcv->widthInCtus * std::min((unsigned)encCfg.m_ifpLines + 1, pps.pcv->heightInCtus) - 1: pps.pcv->sizeInCtus - 1;
288
4.34k
}
289
290
291
/** Per-picture preparation: sets up the slice map, the CABAC table index, the slice
 *  QP / lambda, the per-thread and per-tile-line state and the CTU encode delay.
 *
 *  \param pic  picture about to be encoded
 */
void EncSlice::initPic( Picture* pic )
{
  Slice* slice = pic->cs->slice;

  const bool hasMultipleTiles = slice->pps->numTileCols * slice->pps->numTileRows > 1;
  if( hasMultipleTiles )
  {
    slice->sliceMap = slice->pps->sliceMap[0];
  }
  else
  {
    // single tile: the slice covers the whole picture
    slice->sliceMap.addCtusToSlice( 0, pic->cs->pcv->widthInCtus, 0, pic->cs->pcv->heightInCtus, pic->cs->pcv->widthInCtus);
  }

  // this ensures that independently encoded bitstream chunks can be combined to bit-equal
  const bool      useSliceType  = ( ! slice->pps->cabacInitPresent ) || slice->pendingRasInit;
  const SliceType cabacTableIdx = useSliceType ? slice->sliceType : m_encCABACTableIdx;
  slice->encCABACTableIdx = cabacTableIdx;

  // set QP and lambda values
  xInitSliceLambdaQP( slice );

  for( auto* threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.initPic( pic );
  }

  for( auto* lineRsrc : m_TileLineEncRsrc )
  {
    lineRsrc->m_ReuseUniMv.resetReusedUniMvs();
  }

  m_ctuEncDelay = 1;
  if( pic->useIBC )
  {
    // IBC needs unfiltered samples up to max IBC search range
    // therefore ensure that numCtuDelayLUT CTU's have been encoded first
    // assuming IBC localSearchRangeX / Y = 128
    const int  numCtuDelayLUT[ 3 ] = { 15, 3, 1 }; // indexed by log2( CTU size ) - 5
    const auto ctuSizeLog2         = pic->cs->pcv->maxCUSizeLog2;
    CHECK( ctuSizeLog2 < 5 || ctuSizeLog2 > 7, "invalid max CTUSize" );
    m_ctuEncDelay = numCtuDelayLUT[ ctuSizeLog2 - 5 ];
  }
}
332
333
334
335
/** Pre-computes the slice QP, lambda and slice-level chroma QP deltas and hands
 *  lambda / QP to every per-thread CU encoder.
 *
 *  \param slice  slice to set up; sliceQp, sliceChromaQpDelta[] and chromaQpAdjEnabled are written
 */
void EncSlice::xInitSliceLambdaQP( Slice* slice )
{
  auto& pic = *slice->pic;

  // pre-compute lambda and QP
  const bool rcp = ( m_pcEncCfg->m_RCTargetBitrate > 0 && pic.picInitialQP >= 0 ); // 2nd pass
  int iQP = Clip3 (-slice->sps->qpBDOffset[CH_L], MAX_QP, pic.picInitialQP); // RC start QP

  double dQP;
  double dLambda;
  if( rcp )
  {
    // QP and lambda were stored in the picture beforehand
    dQP     = (double) pic.picInitialQP;
    dLambda = pic.picInitialLambda;
  }
  else
  {
    dQP     = xGetQPForPicture (slice);
    dLambda = xCalculateLambda (slice, slice->TLayer, dQP, dQP, iQP);
  }

  int intraOrPeriodicOffs[2] = { m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[0], m_pcEncCfg->m_sliceChromaQpOffsetIntraOrPeriodic[1] };
  const int lookAheadRCCQpOffset = 0;   // was (m_pcEncCfg->m_RCTargetBitrate > 0 && m_pcEncCfg->m_LookAhead && CS::isDualITree (*slice->pic->cs) ? 1 : 0);

  if( m_pcEncCfg->m_usePerceptQPA ) // adapt intraOrPeriodicOffs and pic->ctuAdaptedQP
  {
    const bool cqp = (slice->isIntra() && !slice->sps->IBC) || (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity > 0 && (slice->poc % m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity) == 0);
    const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
    const uint32_t boundingCtuTsAddr = pic.cs->pcv->sizeInCtus;
    int* const     chromaOffsToAdapt = ( slice->pps->sliceChromaQpFlag && cqp ) ? intraOrPeriodicOffs : nullptr;

    // updates pic->picInitialQP
    iQP = BitAllocation::applyQPAdaptationSlice (slice, m_pcEncCfg, iQP, dLambda, &pic.picVA.visAct,
                                                 *m_ThreadRsrc[0]->m_encCu.getQpPtr(), m_pcRateCtrl->getIntraPQPAStats(),
                                                 chromaOffsToAdapt,
                                                 m_pcRateCtrl->getMinNoiseLevels(), startCtuTsAddr, boundingCtuTsAddr);
    if( iQP >= 0 ) // QP OK?
    {
      dLambda *= pow (2.0, ((double) iQP - dQP) / 3.0); // adjust lambda based on change of slice QP
    }
    else
    {
      iQP = (int) dQP; // revert to unadapted slice QP
    }
  }
  else if( rcp )
  {
    pic.picInitialQP = -1; // no QPA - unused now
  }

  // slice-level chroma QP offsets
  int cbQP   = 0;
  int crQP   = 0;
  int cbCrQP = 0;

  const bool dualTreeOffsets = slice->pps->sliceChromaQpFlag && CS::isDualITree (*pic.cs) && !m_pcEncCfg->m_usePerceptQPA && (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity == 0);
  if( dualTreeOffsets )
  {
    cbQP   = m_pcEncCfg->m_chromaCbQpOffsetDualTree + lookAheadRCCQpOffset; // QP offset for dual-tree
    crQP   = m_pcEncCfg->m_chromaCrQpOffsetDualTree + lookAheadRCCQpOffset;
    cbCrQP = m_pcEncCfg->m_chromaCbCrQpOffsetDualTree + lookAheadRCCQpOffset;
  }
  else if( slice->pps->sliceChromaQpFlag )
  {
    const GOPEntry &gopEntry             = *(pic.gopEntry);
    const bool bUseIntraOrPeriodicOffset = (slice->isIntra() && !slice->sps->IBC) || (m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity > 0 && (slice->poc % m_pcEncCfg->m_sliceChromaQpOffsetPeriodicity) == 0);

    cbQP   = (bUseIntraOrPeriodicOffset ? intraOrPeriodicOffs[0] : gopEntry.m_CbQPoffset) + lookAheadRCCQpOffset;
    crQP   = (bUseIntraOrPeriodicOffset ? intraOrPeriodicOffs[1] : gopEntry.m_CrQPoffset) + lookAheadRCCQpOffset;
    cbCrQP = (cbQP + crQP) >> 1; // use floor of average CbCr chroma QP offset for joint-CbCr coding

    // keep the sum of PPS offset and slice offset inside [-12, 12]
    cbQP   = Clip3 (-12, 12, cbQP + slice->pps->chromaQpOffset[COMP_Cb]) - slice->pps->chromaQpOffset[COMP_Cb];
    crQP   = Clip3 (-12, 12, crQP + slice->pps->chromaQpOffset[COMP_Cr]) - slice->pps->chromaQpOffset[COMP_Cr];
    cbCrQP = Clip3 (-12, 12, cbCrQP + slice->pps->chromaQpOffset[COMP_JOINT_CbCr]) - slice->pps->chromaQpOffset[COMP_JOINT_CbCr];
  }

  slice->sliceChromaQpDelta[COMP_Cb]         = Clip3 (-12, 12, cbQP);
  slice->sliceChromaQpDelta[COMP_Cr]         = Clip3 (-12, 12, crQP);
  slice->sliceChromaQpDelta[COMP_JOINT_CbCr] = (slice->sps->jointCbCr ? Clip3 (-12, 12, cbCrQP) : 0);

  for( auto& threadRsrc : m_ThreadRsrc )
  {
    threadRsrc->m_encCu.setUpLambda( *slice, dLambda, iQP, true, true );
  }

  slice->sliceQp            = iQP;
  slice->chromaQpAdjEnabled = slice->pps->chromaQpOffsetListLen > 0;
}
398
399
// Additive QP term used by EncSlice::xGetQPForPicture for non-intra slices when perceptual QPA
// is enabled; indexed by gopEntry->m_vtl.
// NOTE(review): the table has 6 entries and the use site does no bounds check - presumably m_vtl is always in [0, 5]; confirm.
static const int highTL[6] = { -1, 0, 0, 2, 4, 5 };
400
401
int EncSlice::xGetQPForPicture( const Slice* slice )
402
1.08k
{
403
1.08k
  const int lumaQpBDOffset = slice->sps->qpBDOffset[ CH_L ];
404
1.08k
  int qp;
405
406
1.08k
  if ( m_pcEncCfg->m_costMode == VVENC_COST_LOSSLESS_CODING )
407
0
  {
408
0
    qp = LOSSLESS_AND_MIXED_LOSSLESS_RD_COST_TEST_QP;
409
0
  }
410
1.08k
  else
411
1.08k
  {
412
1.08k
    qp = m_pcEncCfg->m_QP + slice->pic->gopAdaptedQP;
413
414
1.08k
    if (m_pcEncCfg->m_usePerceptQPA)
415
1.08k
    {
416
1.08k
      const int tlayer = slice->pic->gopEntry->m_vtl;
417
418
1.08k
      qp = (slice->isIntra() ? std::min (qp, ((qp - std::min (3, floorLog2 (m_pcEncCfg->m_GOPSize) - 4/*TODO 3 with JVET-AC0149?*/)) * 15 + 3) >> 4) : highTL[tlayer] + ((qp * (16 + std::min (2, tlayer))) >> 4) + 0/*TODO +-1?*/);
419
1.08k
    }
420
0
    else if( slice->isIntra() )
421
0
    {
422
0
      qp += m_pcEncCfg->m_intraQPOffset;
423
0
    }
424
0
    else
425
0
    {
426
0
      if( qp != -lumaQpBDOffset )
427
0
      {
428
0
        const GOPEntry &gopEntry = *(slice->pic->gopEntry);
429
        // adjust QP according to the QP offset for the GOP entry.
430
0
        qp += gopEntry.m_QPOffset;
431
432
        // adjust QP according to QPOffsetModel for the GOP entry.
433
0
        double dqpOffset = qp * gopEntry.m_QPOffsetModelScale + gopEntry.m_QPOffsetModelOffset + 0.5;
434
0
        int qpOffset = (int)floor( Clip3<double>( 0.0, 3.0, dqpOffset ) );
435
0
        qp += qpOffset;
436
0
      }
437
0
    }
438
439
1.08k
    if( m_pcEncCfg->m_blockImportanceMapping && !slice->pic->m_picShared->m_ctuBimQpOffset.empty() )
440
0
    {
441
0
      qp += slice->pic->m_picShared->m_picAuxQpOffset;
442
0
    }
443
1.08k
  }
444
1.08k
  qp = Clip3( -lumaQpBDOffset, MAX_QP, qp );
445
1.08k
  return qp;
446
1.08k
}
447
448
449
double EncSlice::xCalculateLambda( const Slice* slice,
450
                                   const int    depth, // slice GOP hierarchical depth.
451
                                   const double refQP, // initial slice-level QP
452
                                   const double dQP,   // initial double-precision QP
453
                                         int&   iQP )  // returned integer QP.
454
1.08k
{
455
1.08k
  const GOPEntry &gopEntry = *(slice->pic->gopEntry);
456
1.08k
  const int SHIFT_QP       = 12;
457
1.08k
  const int temporalId     = gopEntry.m_temporalId;
458
1.08k
  std::vector<double> intraLambdaModifiers;
459
1.08k
  for ( int i = 0; i < VVENC_MAX_TLAYER; i++ )
460
1.08k
  {
461
1.08k
    if( m_pcEncCfg->m_adIntraLambdaModifier[i] != 0.0 ) intraLambdaModifiers.push_back( m_pcEncCfg->m_adIntraLambdaModifier[i] );
462
1.08k
    else break;
463
1.08k
  }
464
465
1.08k
  int bitdepth_luma_qp_scale = 6
466
1.08k
                               * (slice->sps->bitDepths[ CH_L ] - 8
467
1.08k
                                  - DISTORTION_PRECISION_ADJUSTMENT(slice->sps->bitDepths[ CH_L ]));
468
1.08k
  double qp_temp = dQP + bitdepth_luma_qp_scale - SHIFT_QP;
469
  // Case #1: I or P-slices (key-frame)
470
1.08k
  double dQPFactor = gopEntry.m_QPFactor;
471
1.08k
  if( slice->sliceType == VVENC_I_SLICE )
472
1.08k
  {
473
1.08k
    if (m_pcEncCfg->m_dIntraQpFactor>=0.0 && gopEntry.m_sliceType != 'I')
474
0
    {
475
0
      dQPFactor = m_pcEncCfg->m_dIntraQpFactor;
476
0
    }
477
1.08k
    else
478
1.08k
    {
479
1.08k
      dQPFactor = 0.57;
480
1.08k
      if( ! m_pcEncCfg->m_lambdaFromQPEnable )
481
0
      {
482
0
        const int NumberBFrames = ( m_pcEncCfg->m_GOPSize - 1 );
483
0
        const double dLambda_scale = 1.0 - Clip3( 0.0, 0.5, 0.05 * (double)NumberBFrames );
484
0
        dQPFactor *= dLambda_scale;
485
0
      }
486
1.08k
    }
487
1.08k
  }
488
0
  else if( m_pcEncCfg->m_lambdaFromQPEnable )
489
0
  {
490
0
    dQPFactor=0.57;
491
0
  }
492
493
1.08k
  double dLambda = dQPFactor*pow( 2.0, qp_temp/3.0 );
494
495
1.08k
  if( !(m_pcEncCfg->m_lambdaFromQPEnable) && depth>0 )
496
0
  {
497
0
    double qp_temp_ref = refQP + bitdepth_luma_qp_scale - SHIFT_QP;
498
0
    dLambda *= Clip3(2.00, 4.00, (qp_temp_ref / 6.0));   // (j == B_SLICE && p_cur_frm->layer != 0 )
499
0
  }
500
501
  // if hadamard is used in ME process
502
1.08k
  if ( !m_pcEncCfg->m_bUseHADME && slice->sliceType != VVENC_I_SLICE )
503
0
  {
504
0
    dLambda *= 0.95;
505
0
  }
506
507
1.08k
  double lambdaModifier;
508
1.08k
  if( slice->sliceType != VVENC_I_SLICE || intraLambdaModifiers.empty())
509
1.08k
  {
510
1.08k
    lambdaModifier = m_pcEncCfg->m_adLambdaModifier[ temporalId ];
511
1.08k
  }
512
0
  else
513
0
  {
514
0
    lambdaModifier = intraLambdaModifiers[ (temporalId < intraLambdaModifiers.size()) ? temporalId : (intraLambdaModifiers.size()-1) ];
515
0
  }
516
1.08k
  dLambda *= lambdaModifier;
517
518
1.08k
  iQP = Clip3( -slice->sps->qpBDOffset[ CH_L ], MAX_QP, (int) floor( dQP + 0.5 ) );
519
520
1.08k
  if( m_pcEncCfg->m_DepQuantEnabled )
521
1.08k
  {
522
1.08k
    dLambda *= pow( 2.0, 0.25/3.0 ); // slight lambda adjustment for dependent quantization (due to different slope of quantizer)
523
1.08k
  }
524
525
  // NOTE: the lambda modifiers that are sometimes applied later might be best always applied in here.
526
1.08k
  return dLambda;
527
1.08k
}
528
529
530
// ====================================================================================================================
531
// Public member functions
532
// ====================================================================================================================
533
534
535
/** \param pic   picture class
536
 */
537
void EncSlice::compressSlice( Picture* pic )
538
1.08k
{
539
1.08k
  PROFILER_SCOPE_AND_STAGE( 1, g_timeProfiler, P_COMPRESS_SLICE );
540
1.08k
  CodingStructure& cs         = *pic->cs;
541
1.08k
  Slice* const slice          = cs.slice;
542
1.08k
  uint32_t  startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0];
543
1.08k
  uint32_t  boundingCtuTsAddr = pic->cs->pcv->sizeInCtus;
544
545
1.08k
  cs.pcv      = slice->pps->pcv;
546
1.08k
  cs.fracBits = 0;
547
548
1.08k
  if( startCtuTsAddr == 0 )
549
1.08k
  {
550
1.08k
    cs.initStructData( slice->sliceQp );
551
1.08k
  }
552
553
1.08k
  for( auto* thrRsrc : m_ThreadRsrc )
554
4.34k
  {
555
4.34k
    thrRsrc->m_encCu.initSlice( slice );
556
4.34k
  }
557
558
1.08k
  for( auto* lnRsrc : m_TileLineEncRsrc )
559
1.90k
  {
560
1.90k
    lnRsrc->m_CABACEstimator    .initCtxModels( *slice );
561
1.90k
    lnRsrc->m_SaoCABACEstimator .initCtxModels( *slice );
562
1.90k
    lnRsrc->m_AlfCABACEstimator .initCtxModels( *slice );
563
1.90k
    lnRsrc->m_AffineProfList    .resetAffineMVList();
564
1.90k
    lnRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
565
1.90k
    lnRsrc->m_CachedBvs         .resetIbcBvCand();
566
567
1.90k
    if( slice->sps->saoEnabled && pic->useSAO )
568
1.90k
    {
569
1.90k
      lnRsrc->m_encSao          .initSlice( slice );
570
1.90k
    }
571
1.90k
  }
572
573
1.08k
  if( slice->sps->fpelMmvd && !slice->picHeader->disFracMMVD )
574
1.08k
  {
575
1.08k
    slice->picHeader->disFracMMVD = ( pic->lwidth() * pic->lheight() > 1920 * 1080 ) ? true : false;
576
1.08k
  }
577
578
1.08k
  xProcessCtus( pic, startCtuTsAddr, boundingCtuTsAddr );
579
1.08k
}
580
581
/** Decides the joint-CbCr sign flag of the picture header from the original chroma samples.
 *
 *  Both chroma planes of the given luma area are high-pass filtered with a 3x3 kernel
 *  (center weight 12, direct neighbours -2, diagonal neighbours -1) and the products of
 *  the filtered Cb and Cr samples are summed. The flag is set when that sum is negative.
 *  For formats without chroma the flag is set to true.
 *
 *  \param cs           coding structure; cs.slice->picHeader->jointCbCrSign is written
 *  \param topLeftLuma  top-left position of the analysed area in luma samples
 *  \param sizeLuma     size of the analysed area in luma samples
 */
void setJointCbCrModes( CodingStructure& cs, const Position topLeftLuma, const Size sizeLuma )
{
  bool sgnFlag = true;

  if( isChromaEnabled( cs.picture->chromaFormat) )
  {
    const CompArea  cbArea  = CompArea( COMP_Cb, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );
    const CompArea  crArea  = CompArea( COMP_Cr, cs.picture->chromaFormat, Area(topLeftLuma,sizeLuma), true );

    const bool      useRsp  = cs.picture->getFilteredOrigBuffer().valid();
    const CPelBuf   orgCb   = useRsp ? cs.picture->getRspOrigBuf( cbArea ): cs.picture->getOrigBuf( cbArea );
    const CPelBuf   orgCr   = useRsp ? cs.picture->getRspOrigBuf( crArea ): cs.picture->getOrigBuf( crArea );

    // skip the outermost row / column where the area touches the picture border,
    // because the 3x3 kernel reads one sample beyond the current position
    const int       x0      = ( cbArea.x > 0 ? 0 : 1 );
    const int       y0      = ( cbArea.y > 0 ? 0 : 1 );
    const int       x1      = ( cbArea.x + cbArea.width  < cs.picture->Cb().width  ? cbArea.width  : cbArea.width  - 1 );
    const int       y1      = ( cbArea.y + cbArea.height < cs.picture->Cb().height ? cbArea.height : cbArea.height - 1 );
    const int       cbs     = orgCb.stride;
    const int       crs     = orgCr.stride;
    const Pel*      pCb     = orgCb.buf + y0 * cbs;
    const Pel*      pCr     = orgCr.buf + y0 * crs;
    int64_t         sumCbCr = 0;

    // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes
    for( int y = y0; y < y1; y++, pCb += cbs, pCr += crs )
    {
      for( int x = x0; x < x1; x++ )
      {
        const int cb = ( 12*(int)pCb[x] - 2*((int)pCb[x-1] + (int)pCb[x+1] + (int)pCb[x-cbs] + (int)pCb[x+cbs]) - ((int)pCb[x-1-cbs] + (int)pCb[x+1-cbs] + (int)pCb[x-1+cbs] + (int)pCb[x+1+cbs]) );
        const int cr = ( 12*(int)pCr[x] - 2*((int)pCr[x-1] + (int)pCr[x+1] + (int)pCr[x-crs] + (int)pCr[x+crs]) - ((int)pCr[x-1-crs] + (int)pCr[x+1-crs] + (int)pCr[x-1+crs] + (int)pCr[x+1+crs]) );
        // widen before multiplying: the product of two filtered samples can exceed the
        // 32-bit range for large sample values, and signed overflow is undefined
        sumCbCr += static_cast<int64_t>( cb ) * cr;
      }
    }

    sgnFlag = ( sumCbCr < 0 );
  }

  cs.slice->picHeader->jointCbCrSign = sgnFlag;
}
618
619
/// Immutable position of one CTU: column, row and address.
struct CtuPos
{
  const int ctuPosX;    ///< CTU column
  const int ctuPosY;    ///< CTU row
  const int ctuRsAddr;  ///< CTU address

  /// \param _x  CTU column
  /// \param _y  CTU row
  /// \param _a  CTU address
  CtuPos( int _x, int _y, int _a )
    : ctuPosX  ( _x )
    , ctuPosY  ( _y )
    , ctuRsAddr( _a )
  {
  }
};
627
628
class CtuTsIterator
629
{
630
  private:
631
    const CodingStructure& cs;
632
    const int        m_startTsAddr;
633
    const int        m_endTsAddr;
634
    std::vector<int> m_ctuAddrMap;
635
          int        m_ctuTsAddr;
636
637
  private:
638
    int getNextTsAddr( const int _tsAddr ) const
639
3.33k
    {
640
3.33k
      const PreCalcValues& pcv  = *cs.pcv;
641
3.33k
      const int startSliceRsRow = m_startTsAddr / pcv.widthInCtus;
642
3.33k
      const int startSliceRsCol = m_startTsAddr % pcv.widthInCtus;
643
3.33k
      const int endSliceRsRow   = (m_endTsAddr - 1) / pcv.widthInCtus;
644
3.33k
      const int endSliceRsCol   = (m_endTsAddr - 1) % pcv.widthInCtus;
645
3.33k
            int ctuTsAddr = _tsAddr;
646
3.33k
      CHECK( ctuTsAddr > m_endTsAddr, "error: array index out of bounds" );
647
4.41k
      while( ctuTsAddr < m_endTsAddr )
648
3.33k
      {
649
3.33k
        ctuTsAddr++;
650
3.33k
        const int ctuRsAddr = ctuTsAddr; 
651
3.33k
        if( cs.slice->pps->rectSlice
652
3.33k
            && ( (ctuRsAddr / pcv.widthInCtus) < startSliceRsRow
653
3.33k
              || (ctuRsAddr / pcv.widthInCtus) > endSliceRsRow
654
2.24k
              || (ctuRsAddr % pcv.widthInCtus) < startSliceRsCol
655
2.24k
              || (ctuRsAddr % pcv.widthInCtus) > endSliceRsCol ) )
656
1.08k
          continue;
657
2.24k
        break;
658
3.33k
      }
659
3.33k
      return ctuTsAddr;
660
3.33k
    }
661
662
    int mapAddr( const int _addr ) const
663
3.33k
    {
664
3.33k
      if( _addr < 0 )
665
0
        return _addr;
666
3.33k
      if( _addr >= m_ctuAddrMap.size() )
667
0
        return _addr;
668
3.33k
      return m_ctuAddrMap[ _addr ];
669
3.33k
    }
670
671
  public:
672
1.08k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e,       std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
673
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, bool _wpp                          ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ),                     m_ctuTsAddr( _s ) { if( _wpp ) setWppPattern(); }
674
0
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m         ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( _s ) {}
675
1.08k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>& _m, int _c ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuAddrMap( _m ), m_ctuTsAddr( std::max( _s, _c ) ) {}
676
1.08k
    CtuTsIterator( const CodingStructure& _cs, int _s, int _e, const std::vector<int>* _m, bool _wpp ) : cs( _cs ), m_startTsAddr( _s ), m_endTsAddr( _e ), m_ctuTsAddr( _s ) {  if( _wpp ) m_ctuAddrMap = *_m;  }
677
678
7.67k
    virtual ~CtuTsIterator() { m_ctuAddrMap.clear(); }
679
680
3.33k
    CtuTsIterator& operator++()                { m_ctuTsAddr = getNextTsAddr( m_ctuTsAddr ); return *this; }
681
0
    CtuTsIterator  operator++(int)             { auto retval = *this; ++(*this); return retval; }
682
0
    bool operator==(CtuTsIterator other) const { return m_ctuTsAddr == other.m_ctuTsAddr; }
683
4.41k
    bool operator!=(CtuTsIterator other) const { return m_ctuTsAddr != other.m_ctuTsAddr; }
684
3.33k
    CtuPos operator*()                   const { const int ctuRsAddr = mapAddr( m_ctuTsAddr );  return CtuPos( ctuRsAddr % cs.pcv->widthInCtus, ctuRsAddr / cs.pcv->widthInCtus, ctuRsAddr ); }
685
686
1.08k
    CtuTsIterator begin() { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap ); };
687
1.08k
    CtuTsIterator end()   { return CtuTsIterator( cs, m_startTsAddr, m_endTsAddr, m_ctuAddrMap, m_endTsAddr ); };
688
689
    using iterator_category = std::forward_iterator_tag;
690
    using value_type        = int;
691
    using pointer           = int*;
692
    using reference         = int&;
693
    using difference_type   = ptrdiff_t;
694
695
    void setWppPattern()
696
0
    {
697
0
      const PreCalcValues& pcv = *cs.pcv;
698
0
      m_ctuAddrMap.resize( pcv.sizeInCtus, 0 );
699
0
      int addr = 0;
700
0
      for( int i = 1; i < pcv.sizeInCtus; i++ )
701
0
      {
702
0
        int x = addr % pcv.widthInCtus;
703
0
        int y = addr / pcv.widthInCtus;
704
0
        x -= 1;
705
0
        y += 1;
706
0
        if( x < 0 || y >= pcv.heightInCtus )
707
0
        {
708
0
          x += 1 + y;
709
0
          y  = 0;
710
0
        }
711
0
        if( x >= pcv.widthInCtus )
712
0
        {
713
0
          y += ( x - pcv.widthInCtus ) + 1;
714
0
          x  = pcv.widthInCtus - 1;
715
0
        }
716
0
        addr = y * pcv.widthInCtus + x;
717
0
        m_ctuAddrMap[ i ] = addr;
718
0
      }
719
0
    }
720
};
721
722
// Forwards to EncSampleAdaptiveOffset::disabledRate(), passing this slice encoder's m_saoDisabledRate
// together with the SAO encoding rates and the internal chroma format taken from the encoder config.
void EncSlice::saoDisabledRate( CodingStructure& cs, SAOBlkParam* reconParams )
{
  EncSampleAdaptiveOffset::disabledRate( cs,
                                         m_saoDisabledRate,
                                         reconParams,
                                         m_pcEncCfg->m_saoEncodingRate,
                                         m_pcEncCfg->m_saoEncodingRateChroma,
                                         m_pcEncCfg->m_internChromaFormat );
}
726
727
// Finalizes a compressed slice. Only does something when SAO is enabled in the SPS and used for the
// picture: optionally stores the SAO disabled statistics and copies the SAO on/off decisions into
// every slice of the picture.
void EncSlice::finishCompressSlice( Picture* pic, Slice& slice )
{
  CodingStructure& cs = *pic->cs;

  // finalize
  const bool saoActive = slice.sps->saoEnabled && pic->useSAO;
  if( !saoActive )
  {
    return;
  }

  // store disabled statistics (done here only when no threads are configured)
  if( !m_pcEncCfg->m_numThreads )
  {
    saoDisabledRate( cs, &m_saoReconParams[ 0 ] );
  }

  // set slice header flags; both chroma components must carry the same decision
  CHECK( m_saoEnabled[ COMP_Cb ] != m_saoEnabled[ COMP_Cr ], "Unspecified error");
  for( auto picSlice : pic->slices )
  {
    picSlice->saoEnabled[ CH_L ] = m_saoEnabled[ COMP_Y  ];
    picSlice->saoEnabled[ CH_C ] = m_saoEnabled[ COMP_Cb ];
  }
}
747
748
// Sets up and runs the per-CTU processing of one picture.
//
// Steps:
//  1. picture-level initialization (joint CbCr modes, SAO picture decision, ALF encoder init),
//  2. reset of all CTU process states to CTU_ENCODE,
//  3. filling of ctuEncParams with one entry per CTU in the order delivered by CtuTsIterator,
//  4. execution: with threads, one barrier task per CTU is queued in the thread pool; without threads,
//     the CTU tasks are called in a loop until the last CTU has reached PROCESS_DONE.
//
// pic               picture to be processed (its coding structure and slice are used)
// startCtuTsAddr    first CTU address handed to the CTU iterator
// boundingCtuTsAddr end address handed to the CTU iterator; boundingCtuTsAddr - 1 is used as index of the last CTU
void EncSlice::xProcessCtus( Picture* pic, const unsigned startCtuTsAddr, const unsigned boundingCtuTsAddr )
{
  PROFILER_SCOPE_TOP_LEVEL_EXT( 1, g_timeProfiler, P_IGNORE, pic->cs );
  CodingStructure& cs      = *pic->cs;
  Slice&           slice   = *cs.slice;
  const PreCalcValues& pcv = *cs.pcv;

  // initialization
  if( slice.sps->jointCbCr )
  {
    setJointCbCrModes( cs, Position(0, 0), cs.area.lumaSize() );
  }

  if( slice.sps->saoEnabled && pic->useSAO )
  {
    // check SAO enabled or disabled
    EncSampleAdaptiveOffset::decidePicParams( cs, m_saoDisabledRate, m_saoEnabled, m_pcEncCfg->m_saoEncodingRate, m_pcEncCfg->m_saoEncodingRateChroma, m_pcEncCfg->m_internChromaFormat );

    // SAO counts as completely disabled only if it is off for every valid component
    m_saoAllDisabled = true;
    for( int compIdx = 0; compIdx < getNumberValidComponents( pcv.chrFormat ); compIdx++ )
    {
      m_saoAllDisabled &= ! m_saoEnabled[ compIdx ];
    }

    std::fill( m_saoReconParams.begin(), m_saoReconParams.end(), SAOBlkParam() );
  }
  else
  {
    m_saoAllDisabled = true;
  }

  if( slice.sps->alfEnabled )
  {
    m_pALF->initEncProcess( slice );
  }

  std::fill( m_processStates.begin(), m_processStates.end(), CTU_ENCODE );

  // fill encoder parameter list
  // (the unused local copy of slice.sliceMap.ctuAddrInSlice that used to be made here has been removed)
  int idx = 0;
  auto ctuIter = CtuTsIterator( cs, startCtuTsAddr, boundingCtuTsAddr, &m_ctuAddrMap, m_pcEncCfg->m_numThreads > 0 );
  for( auto ctuPos : ctuIter )
  {
    ctuEncParams[ idx ].pic       = pic;
    ctuEncParams[ idx ].encSlice  = this;
    ctuEncParams[ idx ].ctuRsAddr = ctuPos.ctuRsAddr;
    ctuEncParams[ idx ].ctuPosX   = ctuPos.ctuPosX;
    ctuEncParams[ idx ].ctuPosY   = ctuPos.ctuPosY;
    ctuEncParams[ idx ].ctuArea   = UnitArea( pic->chromaFormat, slice.pps->pcv->getCtuArea( ctuPos.ctuPosX, ctuPos.ctuPosY ) );

    // per tile-line resources are only distinguished when threads are used
    if( m_pcEncCfg->m_numThreads > 0 )
    {
      ctuEncParams[idx].tileLineResIdx = slice.pps->getTileLineId( ctuPos.ctuPosX, ctuPos.ctuPosY );
    }
    else
    {
      ctuEncParams[idx].tileLineResIdx = 0;
    }
    idx++;
  }

  //for( int i = 0; i < idx; i++ )
  //{
  //  for( int j = i; j < idx; j++ )
  //  {
  //    if( ctuEncParams[i].tileLineResIdx != ctuEncParams[j].tileLineResIdx ) continue;
  //
  //    CHECK( ctuEncParams[i].ctuPosY != ctuEncParams[j].ctuPosY, "Not the same CTU line!" );
  //    CHECK( slice.pps->getTileIdx( ctuEncParams[i].ctuPosX, ctuEncParams[i].ctuPosY ) != slice.pps->getTileIdx( ctuEncParams[j].ctuPosX, ctuEncParams[j].ctuPosY ), "Not the same tile!" );
  //  }
  //}

  // exactly one parameter set per CTU of the picture must have been written
  CHECK( idx != pcv.sizeInCtus, "array index out of bounds" );

  // process ctu's until last ctu is done
  if( m_pcEncCfg->m_numThreads > 0 )
  {
    // xProcessCtuTask<false> performs the work; xProcessCtuTask<true> is handed over as additional
    // callback (with checkReadyState set the task only reports whether it could run)
    for( auto& ctuEncParam : ctuEncParams )
    {
      m_threadPool->addBarrierTask( EncSlice::xProcessCtuTask<false>,
                                    &ctuEncParam,
                                    m_ctuTasksDoneCounter,
                                    nullptr,
                                    {},
                                    EncSlice::xProcessCtuTask<true> );
    }
  }
  else
  {
    // no threads: keep sweeping over all CTUs, each call advances a CTU by at most one stage if its
    // dependencies are met, until the last CTU is done
    do
    {
      for( auto& ctuEncParam : ctuEncParams )
      {
        if( m_processStates[ctuEncParam.ctuRsAddr] != PROCESS_DONE )
          EncSlice::xProcessCtuTask<false>( 0, &ctuEncParam );
      }
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_LUMA_LF,   cs, cs.getRecoBuf(), COMP_Y  );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cb );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == SAO_FILTER && m_processStates[ boundingCtuTsAddr - 1 ] == SAO_FILTER, D_REC_CB_CHROMA_LF, cs, cs.getRecoBuf(), COMP_Cr );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_LUMA_SAO,   cs, cs.getRecoBuf(), COMP_Y  );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cb );
      DTRACE_PIC_COMP_COND( m_processStates[ 0 ] == ALF_GET_STATISTICS && m_processStates[ boundingCtuTsAddr - 1 ] == ALF_GET_STATISTICS, D_REC_CB_CHROMA_SAO, cs, cs.getRecoBuf(), COMP_Cr );
    }
    while( m_processStates[ boundingCtuTsAddr - 1 ] != PROCESS_DONE );
  }
}
855
856
inline bool checkCtuTaskNbTop( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
857
722k
{
858
722k
  return ctuPosY > 0 && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus ] <= tskType;
859
722k
}
860
861
inline bool checkCtuTaskNbBot( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
862
282k
{
863
282k
  return ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 0, 1 ) ) && processStates[ ctuRsAddr     + pps.pcv->widthInCtus ] <= tskType;
864
282k
}
865
866
inline bool checkCtuTaskNbRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
867
571k
{
868
571k
  return ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, 0 ) ) && processStates[ ctuRsAddr + 1 ] <= tskType;
869
571k
}
870
871
inline bool checkCtuTaskNbTopRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, bool override = false )
872
243k
{
873
243k
  return ctuPosY > 0 && ctuPosX + 1 < pps.pcv->widthInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, 1, -1 ) ) && processStates[ ctuRsAddr - pps.pcv->widthInCtus + 1 ] <= tskType;
874
243k
}
875
876
inline bool checkCtuTaskNbBotRgt( const PPS& pps, const int& ctuPosX, const int& ctuPosY, const int& ctuRsAddr, const ProcessCtuState* processStates, const TaskType tskType, const int rightOffset = 1, bool override = false )
877
7.32M
{
878
7.32M
  return ctuPosX + rightOffset < pps.pcv->widthInCtus && ctuPosY + 1 < pps.pcv->heightInCtus && ( override || pps.canFilterCtuBdry( ctuPosX, ctuPosY, rightOffset, 1 ) ) && processStates[ ctuRsAddr + rightOffset + pps.pcv->widthInCtus ] <= tskType;
879
7.32M
}
880
881
template<bool checkReadyState>
882
bool EncSlice::xProcessCtuTask( int threadIdx, void* taskParam )
883
106M
{
884
106M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
106M
  Picture* pic                   = ctuEncParam->pic;
886
106M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
106M
  CodingStructure& cs            = *pic->cs;
888
106M
  Slice&           slice         = *cs.slice;
889
106M
  const PPS&       pps           = *slice.pps;
890
106M
  const PreCalcValues& pcv       = *cs.pcv;
891
106M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
106M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
106M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
106M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
106M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
106M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
106M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
106M
  const int ctuStride            = pcv.widthInCtus;
899
106M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
106M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
106M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
106M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
106M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
106M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
106M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
106M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
106M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
57.5M
    return false;
916
917
49.4M
  switch( currState )
918
49.4M
  {
919
    // encode
920
24.2M
    case CTU_ENCODE:
921
24.2M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
24.2M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
24.2M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
24.2M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
20.0M
          return false;
937
4.26M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
4.26M
          return false;
939
        
940
6.65k
        if( checkReadyState )
941
3.32k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
3.32k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
3.32k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
3.32k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
3.32k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
3.32k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
3.32k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
3.33k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
1.90k
        {
960
1.90k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
1.90k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
1.90k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
1.90k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
1.90k
        }
965
966
3.32k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
3.32k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
3.32k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
3.32k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
14.1M
    case RESHAPE_LF_VER:
975
14.1M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
14.1M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
14.1M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
14.1M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
14.1M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles
984
14.1M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
21.4M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
14.1M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
6.81M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
7.31M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
7.31M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
7.30M
          return false;
1005
1006
6.57k
        if( checkReadyState )
1007
3.33k
          return true;
1008
1009
3.24k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
3.24k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
3.24k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
3.33k
        {
1023
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
3.33k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
3.33k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
3.33k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
3.33k
        }
1032
1033
3.24k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
3.24k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
3.24k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
416k
    case LF_HOR:
1041
416k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
416k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
138k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
278k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
271k
          return false;
1051
1052
6.61k
        if( checkReadyState )
1053
3.33k
          return true;
1054
1055
3.28k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
3.28k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
3.33k
        {
1059
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
3.33k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
3.33k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
3.33k
        }
1064
1065
3.28k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
3.28k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
3.28k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
299k
    case SAO_FILTER:
1073
299k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
299k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
243k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing depends on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
221k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
205k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
10.5k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
6.65k
        if( checkReadyState )
1086
3.33k
          return true;
1087
1088
3.32k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
3.33k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
3.33k
        {
1093
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
3.33k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
3.33k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
3.33k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
3.33k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
3.33k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
3.33k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
3.33k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
3.33k
        }
1104
1105
        // ALF border extension
1106
3.32k
        if( cs.sps->alfEnabled )
1107
3.33k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
3.33k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
3.33k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
3.33k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
3.33k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
3.33k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
3.33k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
3.33k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
3.33k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
3.33k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
3.33k
        }
1122
1123
        // DMVR refinement can be stored now
1124
3.33k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
3.33k
        {
1126
3.33k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
3.33k
        }
1128
3.32k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
3.32k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
3.32k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
3.32k
        if( ctuPosX == lastCtuColInTileRow )
1133
1.90k
        {
1134
1.90k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
1.90k
        }
1136
1.42k
        else
1137
1.42k
        {
1138
1.42k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.42k
          return true;
1140
1.42k
        }
1141
3.32k
      }
1142
1.90k
      break;
1143
1144
72.5k
    case ALF_GET_STATISTICS:
1145
72.5k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
72.5k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
72.5k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
3.80k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
3.80k
        if( checkReadyState )
1153
1.90k
          return true;
1154
1155
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1.90k
        if( slice.sps->alfEnabled )
1159
1.90k
        {
1160
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
1.90k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
3.33k
          {
1165
3.33k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
3.33k
          }
1167
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
1.90k
        }
1169
1170
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1.90k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1.90k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1.90k
      }
1176
0
      break;
1177
1178
1.62M
    case ALF_DERIVE_FILTER:
1179
1.62M
      {
1180
1.62M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.62M
        if( ctuRsAddr == deriveFilterCtu )
1182
1.62M
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.62M
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
1.63M
          for( int y = 0; y < numCheckLines; y++ )
1186
1.63M
          {
1187
1.65M
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
1.63M
            {
1189
1.63M
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
1.63M
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
1.61M
                return false;
1192
1.63M
            }
1193
1.63M
          }
1194
1.62M
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the sub-sequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
2.17k
        if( checkReadyState )
1202
1.08k
          return true;
1203
1204
1.08k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.08k
        if( slice.sps->alfEnabled )
1207
1.08k
        {
1208
1.08k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.08k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.08k
          {
1211
1.08k
            encSlice->m_pALF->initDerivation( slice );
1212
1.08k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.08k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.08k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.08k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.08k
        }
1228
1229
1.08k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.08k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.08k
      }
1232
0
      break;
1233
1234
8.44M
    case ALF_RECONSTRUCT:
1235
8.44M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
8.44M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
8.44M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
8.44M
          return false;
1240
3.79k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
3.79k
        if( checkReadyState )
1249
1.90k
          return true;
1250
1251
1.89k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
1.89k
        if( slice.sps->alfEnabled )
1254
1.90k
        {
1255
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
3.33k
          {
1259
3.33k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
3.33k
          }
1261
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
1.90k
        }
1263
1264
1.89k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
1.89k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
1.89k
      }
1267
      // don't break: no additional deps, can continue straight away!
1268
      //break;
1269
1270
6.71k
    case CCALF_GET_STATISTICS:
1271
6.71k
      {
1272
6.71k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
4.06k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
2.58k
        if( checkReadyState )
1276
688
          return true;
1277
1278
1.89k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
1.89k
        if( slice.sps->ccalfEnabled )
1282
1.90k
        {
1283
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
3.33k
          {
1287
3.33k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
3.33k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
3.33k
          }
1290
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
1.90k
        }
1292
1293
1.89k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
1.89k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
1.89k
      }
1298
0
      break;
1299
1300
138k
    case CCALF_DERIVE_FILTER:
1301
138k
      {
1302
        // synchronization dependencies
1303
138k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
138k
        if( ctuRsAddr == deriveFilterCtu )
1305
138k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
138k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
145k
          for( int y = 0; y < numCheckLines; y++ )
1309
143k
          {
1310
151k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
143k
            {
1312
143k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
143k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
135k
                return false;
1315
143k
            }
1316
143k
          }
1317
138k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the subsequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
2.17k
        if( checkReadyState )
1325
1.08k
          return true;
1326
1327
1.08k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.08k
        if( slice.sps->ccalfEnabled )
1331
1.08k
        {
1332
1.08k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.08k
          {
1334
1.08k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.08k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.08k
        }
1346
1.08k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.08k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.08k
      }
1350
0
      break;
1351
1352
10.7k
    case CCALF_RECONSTRUCT:
1353
10.7k
      {
1354
        // wait until ccalf filter derivation is done, either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
10.7k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
10.7k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
6.94k
          return false;
1358
1359
3.80k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
3.80k
        if( checkReadyState )
1368
1.90k
          return true;
1369
1370
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
1.90k
        if( slice.sps->ccalfEnabled )
1373
1.90k
        {
1374
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
5.22k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
3.32k
          {
1377
3.32k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
3.32k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
3.32k
          }
1380
1.90k
        }
1381
1382
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
1.90k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
1.90k
        {
1388
1.90k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
1.90k
          const int margin = cs.picture->margin;
1390
1.90k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
1.90k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
1.90k
          if(ctuPosY == 0)
1393
1.08k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
1.90k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.08k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
1.90k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
1.90k
        }
1401
1402
        // perform finish only once for whole picture
1403
1.90k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
1.90k
        if( ctuRsAddr < finishCtu )
1405
816
        {
1406
816
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
816
          return true;
1409
816
        }
1410
1.08k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.08k
      }
1412
1413
32.5k
    case FINISH_SLICE:
1414
32.5k
      {
1415
32.5k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
68.7k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
67.1k
          if( processStates[ i ] < FINISH_SLICE )
1420
30.9k
            return false;
1421
1422
1.61k
        if( checkReadyState )
1423
524
          return true;
1424
1425
1.08k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.08k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.08k
        return true;
1430
1.61k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
49.4M
  }
1440
1441
17.8k
  return false;
1442
49.4M
}
bool vvenc::EncSlice::xProcessCtuTask<false>(int, void*)
Line
Count
Source
883
22.4k
{
884
22.4k
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
22.4k
  Picture* pic                   = ctuEncParam->pic;
886
22.4k
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
22.4k
  CodingStructure& cs            = *pic->cs;
888
22.4k
  Slice&           slice         = *cs.slice;
889
22.4k
  const PPS&       pps           = *slice.pps;
890
22.4k
  const PreCalcValues& pcv       = *cs.pcv;
891
22.4k
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
22.4k
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
22.4k
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
22.4k
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
22.4k
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
22.4k
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
22.4k
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
22.4k
  const int ctuStride            = pcv.widthInCtus;
899
22.4k
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
22.4k
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
22.4k
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
22.4k
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
22.4k
  const TaskType currState       = processStates[ ctuRsAddr ];
904
22.4k
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
22.4k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
22.4k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
22.4k
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
22.4k
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
22.4k
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
22.4k
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
0
    return false;
916
917
22.4k
  switch( currState )
918
22.4k
  {
919
    // encode
920
3.33k
    case CTU_ENCODE:
921
3.33k
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
3.33k
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
3.33k
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
3.33k
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
0
          return false;
937
3.33k
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
0
          return false;
939
        
940
3.33k
        if( checkReadyState )
941
0
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
3.33k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
3.33k
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
3.33k
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
3.33k
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
3.33k
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
3.33k
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
3.33k
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
1.90k
        {
960
1.90k
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
1.90k
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
1.90k
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
1.90k
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
1.90k
        }
965
966
3.33k
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
3.33k
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
3.33k
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
3.33k
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
3.33k
    case RESHAPE_LF_VER:
975
3.33k
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
3.33k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
3.33k
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
3.33k
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
3.33k
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above because of tiling, which allows CTU_ENCODE to run independently across tiles
984
3.33k
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
6.66k
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
3.33k
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
0
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
3.33k
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
3.33k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
0
          return false;
1005
1006
3.33k
        if( checkReadyState )
1007
0
          return true;
1008
1009
3.33k
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
3.33k
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
3.33k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
3.33k
        {
1023
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
3.33k
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
3.33k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
3.33k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
3.33k
        }
1032
1033
3.33k
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
3.33k
        processStates[ ctuRsAddr ] = LF_HOR;
1036
3.33k
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
3.33k
    case LF_HOR:
1041
3.33k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
3.33k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
0
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
3.33k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
0
          return false;
1051
1052
3.33k
        if( checkReadyState )
1053
0
          return true;
1054
1055
3.33k
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
3.33k
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
3.33k
        {
1059
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
3.33k
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
3.33k
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
3.33k
        }
1064
1065
3.33k
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
3.33k
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
3.33k
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
3.33k
    case SAO_FILTER:
1073
3.33k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
3.33k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
3.33k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing depends on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
3.33k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
3.33k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
3.33k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
3.33k
        if( checkReadyState )
1086
0
          return true;
1087
1088
3.33k
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
3.33k
        if( slice.sps->saoEnabled && pic->useSAO )
1092
3.33k
        {
1093
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
3.33k
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
3.33k
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
3.33k
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
3.33k
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
3.33k
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
3.33k
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
3.33k
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
3.33k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
3.33k
        }
1104
1105
        // ALF border extension
1106
3.33k
        if( cs.sps->alfEnabled )
1107
3.33k
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
3.33k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
3.33k
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
3.33k
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
3.33k
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
3.33k
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
3.33k
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
3.33k
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
3.33k
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
3.33k
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
3.33k
        }
1122
1123
        // DMVR refinement can be stored now
1124
3.33k
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
3.33k
        {
1126
3.33k
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
3.33k
        }
1128
3.33k
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
3.33k
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
3.33k
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
3.33k
        if( ctuPosX == lastCtuColInTileRow )
1133
1.90k
        {
1134
1.90k
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
1.90k
        }
1136
1.43k
        else
1137
1.43k
        {
1138
1.43k
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
1.43k
          return true;
1140
1.43k
        }
1141
3.33k
      }
1142
1.90k
      break;
1143
1144
1.90k
    case ALF_GET_STATISTICS:
1145
1.90k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
1.90k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
1.90k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
1.90k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
1.90k
        if( checkReadyState )
1153
0
          return true;
1154
1155
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1.90k
        if( slice.sps->alfEnabled )
1159
1.90k
        {
1160
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
1.90k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
3.33k
          {
1165
3.33k
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
3.33k
          }
1167
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
1.90k
        }
1169
1170
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1.90k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1.90k
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1.90k
      }
1176
0
      break;
1177
1178
1.08k
    case ALF_DERIVE_FILTER:
1179
1.08k
      {
1180
1.08k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.08k
        if( ctuRsAddr == deriveFilterCtu )
1182
1.08k
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.08k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
2.98k
          for( int y = 0; y < numCheckLines; y++ )
1186
1.90k
          {
1187
3.80k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
1.90k
            {
1189
1.90k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
1.90k
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
0
                return false;
1192
1.90k
            }
1193
1.90k
          }
1194
1.08k
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the subsequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.08k
        if( checkReadyState )
1202
0
          return true;
1203
1204
1.08k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
1.08k
        if( slice.sps->alfEnabled )
1207
1.08k
        {
1208
1.08k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
1.08k
          if( ctuRsAddr == deriveFilterCtu )
1210
1.08k
          {
1211
1.08k
            encSlice->m_pALF->initDerivation( slice );
1212
1.08k
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
1.08k
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
1.08k
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
1.08k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
1.08k
        }
1228
1229
1.08k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
1.08k
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
1.08k
      }
1232
0
      break;
1233
1234
1.90k
    case ALF_RECONSTRUCT:
1235
1.90k
      {
1236
        // wait until alf filter derivation is done, either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
1.90k
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
1.90k
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
0
          return false;
1240
1.90k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
1.90k
        if( checkReadyState )
1249
0
          return true;
1250
1251
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
1.90k
        if( slice.sps->alfEnabled )
1254
1.90k
        {
1255
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
3.33k
          {
1259
3.33k
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
3.33k
          }
1261
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
1.90k
        }
1263
1264
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
1.90k
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
1.90k
      }
1267
      // don't break, no additional deps, can continue straight away!
1268
      //break;
1269
1270
2.59k
    case CCALF_GET_STATISTICS:
1271
2.59k
      {
1272
2.59k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
2.12k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
1.90k
        if( checkReadyState )
1276
0
          return true;
1277
1278
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
1.90k
        if( slice.sps->ccalfEnabled )
1282
1.90k
        {
1283
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
5.23k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
3.33k
          {
1287
3.33k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
3.33k
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
3.33k
          }
1290
1.90k
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
1.90k
        }
1292
1293
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
1.90k
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
1.90k
      }
1298
0
      break;
1299
1300
1.08k
    case CCALF_DERIVE_FILTER:
1301
1.08k
      {
1302
        // synchronization dependencies
1303
1.08k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
1.08k
        if( ctuRsAddr == deriveFilterCtu )
1305
1.08k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
1.08k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
2.98k
          for( int y = 0; y < numCheckLines; y++ )
1309
1.90k
          {
1310
3.80k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
1.90k
            {
1312
1.90k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
1.90k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
0
                return false;
1315
1.90k
            }
1316
1.90k
          }
1317
1.08k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the sub-sequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.08k
        if( checkReadyState )
1325
0
          return true;
1326
1327
1.08k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
1.08k
        if( slice.sps->ccalfEnabled )
1331
1.08k
        {
1332
1.08k
          if( ctuRsAddr == deriveFilterCtu )
1333
1.08k
          {
1334
1.08k
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
1.08k
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
1.08k
        }
1346
1.08k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
1.08k
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
1.08k
      }
1350
0
      break;
1351
1352
1.90k
    case CCALF_RECONSTRUCT:
1353
1.90k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
1.90k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
1.90k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
0
          return false;
1358
1359
1.90k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
1.90k
        if( checkReadyState )
1368
0
          return true;
1369
1370
1.90k
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
1.90k
        if( slice.sps->ccalfEnabled )
1373
1.90k
        {
1374
1.90k
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
5.22k
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
3.32k
          {
1377
3.32k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
3.32k
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
3.32k
          }
1380
1.90k
        }
1381
1382
1.90k
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
1.90k
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
1.90k
        {
1388
1.90k
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
1.90k
          const int margin = cs.picture->margin;
1390
1.90k
          recoBuf.extendBorderPelLft( y, height, margin );
1391
1.90k
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
1.90k
          if(ctuPosY == 0)
1393
1.08k
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
1.90k
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
1.08k
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
1.90k
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
1.90k
        }
1401
1402
        // perform finish only once for whole picture
1403
1.90k
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
1.90k
        if( ctuRsAddr < finishCtu )
1405
816
        {
1406
816
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
816
          return true;
1409
816
        }
1410
1.08k
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
1.08k
      }
1412
1413
1.61k
    case FINISH_SLICE:
1414
1.61k
      {
1415
1.61k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous ctu's
1418
4.28k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
3.20k
          if( processStates[ i ] < FINISH_SLICE )
1420
524
            return false;
1421
1422
1.08k
        if( checkReadyState )
1423
0
          return true;
1424
1425
1.08k
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
1.08k
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
1.08k
        return true;
1430
1.08k
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
22.4k
  }
1440
1441
17.8k
  return false;
1442
22.4k
}
bool vvenc::EncSlice::xProcessCtuTask<true>(int, void*)
Line
Count
Source
883
106M
{
884
106M
  CtuEncParam* ctuEncParam       = static_cast<CtuEncParam*>( taskParam );
885
106M
  Picture* pic                   = ctuEncParam->pic;
886
106M
  EncSlice* encSlice             = ctuEncParam->encSlice;
887
106M
  CodingStructure& cs            = *pic->cs;
888
106M
  Slice&           slice         = *cs.slice;
889
106M
  const PPS&       pps           = *slice.pps;
890
106M
  const PreCalcValues& pcv       = *cs.pcv;
891
106M
  const int ctuRsAddr            = ctuEncParam->ctuRsAddr;
892
106M
  const int ctuPosX              = ctuEncParam->ctuPosX;
893
106M
  const int ctuPosY              = ctuEncParam->ctuPosY;
894
106M
  const int x                    = ctuPosX << pcv.maxCUSizeLog2;
895
106M
  const int y                    = ctuPosY << pcv.maxCUSizeLog2;
896
106M
  const int width                = std::min( pcv.maxCUSize, pcv.lumaWidth  - x );
897
106M
  const int height               = std::min( pcv.maxCUSize, pcv.lumaHeight - y );
898
106M
  const int ctuStride            = pcv.widthInCtus;
899
106M
  const int lineIdx              = ctuEncParam->tileLineResIdx;
900
106M
  ProcessCtuState* processStates = encSlice->m_processStates.data();
901
106M
  const UnitArea& ctuArea        = ctuEncParam->ctuArea;
902
106M
  const bool wppSyncEnabled      = cs.sps->entropyCodingSyncEnabled;
903
106M
  const TaskType currState       = processStates[ ctuRsAddr ];
904
106M
  const unsigned syncLines       = encSlice->m_pcEncCfg->m_ifpLines;
905
906
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "poc", cs.slice->poc ) );
907
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
908
106M
  DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", processStates[ ctuRsAddr ] == CTU_ENCODE ? 0 : 1 ) );
909
910
  // process ctu's line wise from left to right
911
106M
  const bool tileParallel = encSlice->m_pcEncCfg->m_tileParallelCtuEnc;
912
106M
  if( tileParallel && currState == CTU_ENCODE && ctuPosX > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) )
913
0
    ; // for CTU_ENCODE on tile boundaries, allow parallel processing of tiles
914
106M
  else if( ctuPosX > 0 && processStates[ ctuRsAddr - 1 ] <= currState && currState < PROCESS_DONE )
915
57.5M
    return false;
916
917
49.4M
  switch( currState )
918
49.4M
  {
919
    // encode
920
24.2M
    case CTU_ENCODE:
921
24.2M
      {
922
        // CTU line-wise inter-frame parallel processing synchronization
923
24.2M
        if( syncLines )
924
0
        {
925
0
          const bool lineStart = ctuPosX == 0 || ( tileParallel && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX - 1, ctuPosY ) );
926
0
          if( lineStart && !refPicCtuLineReady( slice, ctuPosY + (int)syncLines, pcv ) )
927
0
          {
928
0
            return false;
929
0
          }
930
0
        }
931
932
        // general wpp conditions, top and top-right ctu have to be encoded
933
24.2M
        if( encSlice->m_pcEncCfg->m_tileParallelCtuEnc && ctuPosY > 0 && slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX, ctuPosY - 1 ) )
934
0
          ; // allow parallel processing of CTU-encoding on independent tiles
935
24.2M
        else if( ctuPosY > 0                                  && processStates[ ctuRsAddr - ctuStride     ] <= CTU_ENCODE )
936
20.0M
          return false;
937
4.26M
        else if( ctuPosY > 0 && ctuPosX + 1 < pcv.widthInCtus && processStates[ ctuRsAddr - ctuStride + 1 ] <= CTU_ENCODE && !wppSyncEnabled )
938
4.26M
          return false;
939
        
940
3.31k
        if( checkReadyState )
941
3.32k
          return true;
942
943
#ifdef TRACE_ENABLE_ITT
944
        std::stringstream ss;
945
        ss << "Encode_" << slice.poc << "_CTU_" << ctuPosY << "_" << ctuPosX;
946
        __itt_string_handle* itt_handle_ctuEncode = __itt_string_handle_create( ss.str().c_str() );
947
#endif
948
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ctuEncode );
949
950
18.4E
        TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
951
18.4E
        PerThreadRsrc* taskRsrc      = encSlice->m_ThreadRsrc[ threadIdx ];
952
18.4E
        EncCu& encCu                 = taskRsrc->m_encCu;
953
954
18.4E
        encCu.setCtuEncRsrc( &lineEncRsrc->m_CABACEstimator, &taskRsrc->m_CtxCache, &lineEncRsrc->m_ReuseUniMv, &lineEncRsrc->m_BlkUniMvInfoBuffer, &lineEncRsrc->m_AffineProfList, &lineEncRsrc->m_CachedBvs );
955
18.4E
        encCu.encodeCtu( pic, lineEncRsrc->m_prevQp, ctuPosX, ctuPosY );
956
957
        // cleanup line memory when last ctu in line done to reduce overall memory consumption
958
18.4E
        if( encSlice->m_pcEncCfg->m_ensureWppBitEqual && ( ctuPosX == pcv.widthInCtus - 1 || slice.pps->getTileIdx( ctuPosX, ctuPosY ) != slice.pps->getTileIdx( ctuPosX + 1, ctuPosY ) ) )
959
0
        {
960
0
          lineEncRsrc->m_AffineProfList    .resetAffineMVList();
961
0
          lineEncRsrc->m_BlkUniMvInfoBuffer.resetUniMvList();
962
0
          lineEncRsrc->m_ReuseUniMv        .resetReusedUniMvs();
963
0
          lineEncRsrc->m_CachedBvs         .resetIbcBvCand();
964
0
        }
965
966
18.4E
        DTRACE_UPDATE( g_trace_ctx, std::make_pair( "final", 1 ) );
967
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ctuEncode );
968
969
18.4E
        processStates[ ctuRsAddr ] = RESHAPE_LF_VER;
970
18.4E
      }
971
0
      break;
972
973
    // reshape + vertical loopfilter
974
14.1M
    case RESHAPE_LF_VER:
975
14.1M
      {
976
        // clip check to right tile border (CTU_ENCODE pre-processing delay due to IBC)
977
14.1M
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
978
14.1M
        const int lastCtuPosXInTile = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
979
14.1M
        const int checkRight = std::min<int>( encSlice->m_ctuEncDelay, lastCtuPosXInTile - ctuPosX );
980
981
14.1M
        const bool hasTiles = encSlice->m_pcEncCfg->m_tileParallelCtuEnc && slice.pps->getNumTiles() > 1;
982
983
        // need to check line above bcs of tiling, which allows CTU_ENCODE to run independently across tiles
984
14.1M
        if( hasTiles )
985
0
        {
986
0
          if( ctuPosY > 0 )
987
0
          {
988
0
            for( int i = -!!ctuPosX; i <= checkRight; i++ )
989
0
              if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, -1 ) && processStates[ctuRsAddr - ctuStride + i] <= CTU_ENCODE )
990
0
                return false;
991
0
          }
992
0
        }
993
        
994
        // ensure all surrounding ctu's are encoded (intra pred requires non-reshaped and unfiltered residual, IBC requires unfiltered samples too)
995
        // check right with max offset (due to WPP condition above, this implies top-right has been already encoded)
996
21.4M
        for( int i = hasTiles ? -!!ctuPosX : checkRight; i <= checkRight; i++ )
997
14.1M
          if( pps.canFilterCtuBdry( ctuPosX, ctuPosY, i, 0 ) && processStates[ctuRsAddr + i] <= CTU_ENCODE )
998
6.81M
            return false;
999
1000
        // check bottom right with 1 CTU delay (this is only required for intra pred)
1001
        // at the right picture border this will check the bottom CTU
1002
7.30M
        const int checkBottomRight = std::min<int>( 1, lastCtuPosXInTile - ctuPosX );
1003
7.30M
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CTU_ENCODE, checkBottomRight ) ) 
1004
7.30M
          return false;
1005
1006
3.23k
        if( checkReadyState )
1007
3.33k
          return true;
1008
1009
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_rspLfVer );
1010
1011
        // reshape
1012
18.4E
        if( slice.sps->lumaReshapeEnable && slice.picHeader->lmcsEnabled )
1013
0
        {
1014
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_RESHAPER, &cs, CH_L );
1015
0
          PelBuf reco = pic->getRecoBuf( COMP_Y ).subBuf( x, y, width, height );
1016
0
          reco.rspSignal( pic->reshapeData.getInvLUT() );
1017
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1018
0
        }
1019
1020
        // loopfilter
1021
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1022
0
        {
1023
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1024
          // calculate filter strengths
1025
0
          encSlice->m_pLoopFilter->calcFilterStrengthsCTU( cs, ctuArea, true );
1026
1027
          // vertical filter
1028
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1029
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_VER>( cs, ctuArea, MAX_NUM_CH, reco );
1030
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1031
0
        }
1032
1033
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_rspLfVer );
1034
1035
18.4E
        processStates[ ctuRsAddr ] = LF_HOR;
1036
18.4E
      }
1037
0
      break;
1038
1039
    // horizontal loopfilter
1040
413k
    case LF_HOR:
1041
413k
      {
1042
        // ensure horizontal ordering (from top to bottom)
1043
413k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR ) )         
1044
138k
          return false;
1045
1046
        // ensure vertical loop filter of neighbor ctu's will not modify current residual
1047
        // check top, top-right and right ctu
1048
        // (top, top-right checked implicitly due to ordering check above)
1049
274k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, RESHAPE_LF_VER ) ) 
1050
271k
          return false;
1051
1052
3.28k
        if( checkReadyState )
1053
3.33k
          return true;
1054
1055
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_lfHor );
1056
1057
18.4E
        if( !cs.pps->deblockingFilterControlPresent || !cs.pps->deblockingFilterDisabled || cs.pps->deblockingFilterOverrideEnabled )
1058
0
        {
1059
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_DEBLOCK_FILTER, &cs, CH_L );
1060
0
          PelUnitBuf reco = cs.picture->getRecoBuf();
1061
0
          encSlice->m_pLoopFilter->xDeblockArea<EDGE_HOR>( cs, ctuArea, MAX_NUM_CH, reco );
1062
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1063
0
        }
1064
1065
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_lfHor );
1066
1067
18.4E
        processStates[ ctuRsAddr ] = SAO_FILTER;
1068
18.4E
      }
1069
0
      break;
1070
1071
    // SAO filter
1072
296k
    case SAO_FILTER:
1073
296k
      {
1074
        // general wpp conditions, top and top-right ctu have to be filtered
1075
296k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1076
239k
        if( checkCtuTaskNbTopRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER, true ) ) return false;
1077
1078
        // ensure loop filter of neighbor ctu's will not modify current residual
1079
        // sao processing dependents on +1 pixel to each side
1080
        // due to wpp condition above, only right, bottom and bottom-right ctu have to be checked
1081
217k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1082
202k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR,    true ) ) return false;
1083
7.24k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, LF_HOR, 1, true ) ) return false;
1084
1085
3.32k
        if( checkReadyState )
1086
3.33k
          return true;
1087
1088
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_sao );
1089
1090
        // SAO filter
1091
18.4E
        if( slice.sps->saoEnabled && pic->useSAO )
1092
0
        {
1093
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_SAO, &cs, CH_L );
1094
0
          TileLineEncRsrc* lineEncRsrc    = encSlice->m_TileLineEncRsrc[ lineIdx ];
1095
0
          PerThreadRsrc* taskRsrc         = encSlice->m_ThreadRsrc[ threadIdx ];
1096
0
          EncSampleAdaptiveOffset& encSao = lineEncRsrc->m_encSao;
1097
1098
0
          encSao.setCtuEncRsrc( &lineEncRsrc->m_SaoCABACEstimator, &taskRsrc->m_CtxCache );
1099
0
          encSao.storeCtuReco( cs, ctuArea, ctuPosX, ctuPosY );
1100
0
          encSao.getCtuStatistics( cs, encSlice->m_saoStatData, ctuArea, ctuRsAddr );
1101
0
          encSao.decideCtuParams( cs, encSlice->m_saoStatData, encSlice->m_saoEnabled, encSlice->m_saoAllDisabled, ctuArea, ctuRsAddr, &encSlice->m_saoReconParams[ 0 ], cs.picture->getSAO() );
1102
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1103
0
        }
1104
1105
        // ALF border extension
1106
18.4E
        if( cs.sps->alfEnabled )
1107
0
        {
1108
          // we have to do some kind of position aware boundary padding
1109
          // it's done here because the conditions are readable
1110
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1111
0
          const int fltSize  = ( MAX_ALF_FILTER_LENGTH + 1 ) >> 1;
1112
0
          const int xL       = ( ctuPosX == 0 )                 ? ( x-fltSize       ) : ( x );
1113
0
          const int xR       = ( ctuPosX+1 == pcv.widthInCtus ) ? ( x+width+fltSize ) : ( x+width );
1114
1115
0
          if( ctuPosX == 0 )                  recoBuf.extendBorderPelLft( y, height, fltSize );
1116
0
          if( ctuPosX+1 == pcv.widthInCtus )  recoBuf.extendBorderPelRgt( y, height, fltSize );
1117
0
          if( ctuPosY == 0 )                  recoBuf.extendBorderPelTop( xL, xR-xL, fltSize );
1118
0
          if( ctuPosY+1 == pcv.heightInCtus ) recoBuf.extendBorderPelBot( xL, xR-xL, fltSize );
1119
1120
0
          encSlice->m_pALF->copyCTUforALF(cs, ctuPosX, ctuPosY);
1121
0
        }
1122
1123
        // DMVR refinement can be stored now
1124
18.4E
        if( slice.sps->DMVR && !slice.picHeader->disDmvrFlag )
1125
0
        {
1126
0
          CS::setRefinedMotionFieldCTU( cs, ctuPosX, ctuPosY );
1127
0
        }
1128
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_sao );
1129
1130
18.4E
        const int tileCol = slice.pps->ctuToTileCol[ctuPosX];
1131
18.4E
        const int lastCtuColInTileRow = slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1132
18.4E
        if( ctuPosX == lastCtuColInTileRow )
1133
0
        {
1134
0
          processStates[ctuRsAddr] = ALF_GET_STATISTICS;
1135
0
        }
1136
18.4E
        else
1137
18.4E
        {
1138
18.4E
          processStates[ctuRsAddr] = PROCESS_DONE;
1139
18.4E
          return true;
1140
18.4E
        }
1141
18.4E
      }
1142
0
      break;
1143
1144
70.6k
    case ALF_GET_STATISTICS:
1145
70.6k
      {
1146
        // ensure all surrounding ctu's are filtered (ALF will use pixels of adjacent CTU's)
1147
        // due to wpp condition above in SAO_FILTER, only right, bottom and bottom-right ctu have to be checked
1148
70.6k
        if( checkCtuTaskNbRgt   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1149
70.6k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1150
1.90k
        if( checkCtuTaskNbBotRgt( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, SAO_FILTER ) ) return false;
1151
1152
1.90k
        if( checkReadyState )
1153
1.90k
          return true;
1154
1155
1
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_stat );
1156
1157
        // ALF pre-processing
1158
1
        if( slice.sps->alfEnabled )
1159
0
        {
1160
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1161
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1162
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1163
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1164
0
          {
1165
0
            encSlice->m_pALF->getStatisticsCTU( *cs.picture, cs, recoBuf, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1166
0
          }
1167
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1168
0
        }
1169
1170
1
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_stat );
1171
1172
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1173
1
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1174
1
        processStates[ctuRsAddr] = (ctuRsAddr < deriveFilterCtu) ? ALF_RECONSTRUCT: ALF_DERIVE_FILTER;
1175
1
      }
1176
0
      break;
1177
1178
1.62M
    case ALF_DERIVE_FILTER:
1179
1.62M
      {
1180
1.62M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1181
1.62M
        if( ctuRsAddr == deriveFilterCtu )
1182
1.62M
        {
1183
          // ensure statistics from all previous ctu's have been collected
1184
1.62M
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1185
1.63M
          for( int y = 0; y < numCheckLines; y++ )
1186
1.63M
          {
1187
1.64M
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1188
1.63M
            {
1189
1.63M
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1190
1.63M
              if( processStates[lastCtuInTileRow] <= ALF_GET_STATISTICS )
1191
1.61M
                return false;
1192
1.63M
            }
1193
1.63M
          }
1194
1.62M
        }
1195
0
        else if( syncLines )
1196
0
        {
1197
          // ALF bitstream coding dependency for the sub-sequent ctu-lines
1198
0
          if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_DERIVE_FILTER ) ) 
1199
0
            return false;
1200
0
        }
1201
1.08k
        if( checkReadyState )
1202
1.08k
          return true;
1203
1204
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_derive );
1205
        // ALF post-processing
1206
0
        if( slice.sps->alfEnabled )
1207
0
        {
1208
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1209
0
          if( ctuRsAddr == deriveFilterCtu )
1210
0
          {
1211
0
            encSlice->m_pALF->initDerivation( slice );
1212
0
            encSlice->m_pALF->deriveFilter( *cs.picture, cs, slice.getLambdas(), deriveFilterCtu + 1 );
1213
0
            encSlice->m_pALF->reconstructCoeffAPSs( cs, cs.slice->alfEnabled[COMP_Y], cs.slice->alfEnabled[COMP_Cb] || cs.slice->alfEnabled[COMP_Cr], false );
1214
0
          }
1215
0
          else if( syncLines )
1216
0
          {
1217
            // in sync lines mode: derive/select filter for the remaining lines
1218
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1219
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1220
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1221
0
            for(int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++)
1222
0
            {
1223
0
              encSlice->m_pALF->selectFilterForCTU( cs, &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, ctu );
1224
0
            }
1225
0
          }
1226
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1227
0
        }
1228
1229
0
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_derive );
1230
0
        processStates[ ctuRsAddr ] = ALF_RECONSTRUCT;
1231
0
      }
1232
0
      break;
1233
1234
8.44M
    case ALF_RECONSTRUCT:
1235
8.44M
      {
1236
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1237
8.44M
        const unsigned deriveFilterCtu = encSlice->m_alfDeriveCtu;
1238
8.44M
        if( processStates[deriveFilterCtu] < ALF_RECONSTRUCT )
1239
8.44M
          return false;
1240
1.89k
        else if( syncLines && ctuRsAddr > deriveFilterCtu && encSlice->m_pALF->getAsuHeightInCtus() > 1 )
1241
0
        {
1242
0
          const int asuHeightInCtus = encSlice->m_pALF->getAsuHeightInCtus();
1243
0
          const int botCtuLineInAsu = std::min( (( ctuPosY & ( ~(asuHeightInCtus - 1) ) ) + asuHeightInCtus - 1), (int)pcv.heightInCtus - 1 );
1244
0
          if( processStates[botCtuLineInAsu * ctuStride + ctuPosX] < ALF_RECONSTRUCT ) 
1245
0
            return false;
1246
0
        }
1247
1248
1.89k
        if( checkReadyState )
1249
1.90k
          return true;
1250
1251
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_alf_recon );
1252
1253
18.4E
        if( slice.sps->alfEnabled )
1254
0
        {
1255
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L );
1256
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1257
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1258
0
          {
1259
0
            encSlice->m_pALF->reconstructCTU_MT( *cs.picture, cs, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1260
0
          }
1261
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1262
0
        }
1263
1264
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_alf_recon );
1265
18.4E
        processStates[ctuRsAddr] = CCALF_GET_STATISTICS;
1266
18.4E
      }
1267
      // dont break, no additional deps, can continue straigt away!
1268
      //break;
1269
1270
4.12k
    case CCALF_GET_STATISTICS:
1271
4.12k
      {
1272
4.12k
        if( checkCtuTaskNbTop   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1273
1.93k
        if( checkCtuTaskNbBot   ( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, ALF_RECONSTRUCT ) ) return false;
1274
1275
683
        if( checkReadyState )
1276
688
          return true;
1277
1278
18.4E
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_stat );
1279
1280
        // ALF pre-processing
1281
18.4E
        if( slice.sps->ccalfEnabled )
1282
0
        {
1283
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_ALF, &cs, CH_L);
1284
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1285
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1286
0
          {
1287
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1288
0
            encSlice->m_pALF->deriveStatsForCcAlfFilteringCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1289
0
          }
1290
0
          PROFILER_EXT_ACCUM_AND_START_NEW_SET( 1, _TPROF, P_IGNORE, &cs, CH_L );
1291
0
        }
1292
1293
18.4E
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_stat );
1294
1295
        // start alf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1296
18.4E
        processStates[ctuRsAddr] = (ctuRsAddr < encSlice->m_ccalfDeriveCtu) ? CCALF_RECONSTRUCT: CCALF_DERIVE_FILTER;
1297
18.4E
      }
1298
0
      break;
1299
1300
137k
    case CCALF_DERIVE_FILTER:
1301
137k
      {
1302
        // synchronization dependencies
1303
137k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1304
137k
        if( ctuRsAddr == deriveFilterCtu )
1305
137k
        {
1306
          // ensure statistics from all previous ctu's have been collected
1307
137k
          int numCheckLines = deriveFilterCtu / pcv.widthInCtus + 1;
1308
142k
          for( int y = 0; y < numCheckLines; y++ )
1309
141k
          {
1310
147k
            for( int tileCol = 0; tileCol < slice.pps->numTileCols; tileCol++ )
1311
141k
            {
1312
141k
              const int lastCtuInTileRow = y * pcv.widthInCtus + slice.pps->tileColBd[tileCol] + slice.pps->tileColWidth[tileCol] - 1;
1313
141k
              if( processStates[lastCtuInTileRow] <= CCALF_GET_STATISTICS )
1314
135k
                return false;
1315
141k
            }
1316
141k
          }
1317
137k
        }
1318
0
        else if( syncLines )
1319
0
        {
1320
          // ALF bitstream coding dependency for the sub-sequent CTU-lines
1321
0
          if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT || checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_DERIVE_FILTER ) ) 
1322
0
            return false;
1323
0
        }
1324
1.08k
        if( checkReadyState )
1325
1.08k
          return true;
1326
1327
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_derive );
1328
1329
        // start task
1330
0
        if( slice.sps->ccalfEnabled )
1331
0
        {
1332
0
          if( ctuRsAddr == deriveFilterCtu )
1333
0
          {
1334
0
            encSlice->m_pALF->deriveCcAlfFilter( *cs.picture, cs, encSlice->m_ccalfDeriveCtu + 1 );
1335
0
          }
1336
0
          else if( syncLines )
1337
0
          {
1338
            // in sync lines mode: derive/select filter for the remaining lines
1339
0
            TileLineEncRsrc* lineEncRsrc = encSlice->m_TileLineEncRsrc[ lineIdx ];
1340
0
            PerThreadRsrc*   taskRsrc    = encSlice->m_ThreadRsrc[ threadIdx ];
1341
0
            const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1342
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cb, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1343
0
            encSlice->m_pALF->selectCcAlfFilterForCtuLine( cs, COMP_Cr, cs.getRecoBuf(), &lineEncRsrc->m_AlfCABACEstimator, &taskRsrc->m_CtxCache, firstCtuInRow, ctuRsAddr );
1344
0
          }
1345
0
        }
1346
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_derive );
1347
1348
0
        processStates[ctuRsAddr] = CCALF_RECONSTRUCT;
1349
0
      }
1350
0
      break;
1351
1352
8.84k
    case CCALF_RECONSTRUCT:
1353
8.84k
      {
1354
        // start ccalf filter derivation either for a sub-set of CTUs (syncLines mode) or for the whole picture (regular mode)
1355
8.84k
        const unsigned deriveFilterCtu = encSlice->m_ccalfDeriveCtu;
1356
8.84k
        if( processStates[deriveFilterCtu] < CCALF_RECONSTRUCT )
1357
6.94k
          return false;
1358
1359
1.90k
        if( syncLines )
1360
0
        {
1361
          // ensure line-by-line reconstruction due to line synchronization
1362
0
          if( checkCtuTaskNbTop( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_RECONSTRUCT ) ) return false;
1363
          // check bottom due to rec. buffer usage in ccalf statistics
1364
0
          if( checkCtuTaskNbBot( pps, ctuPosX, ctuPosY, ctuRsAddr, processStates, CCALF_GET_STATISTICS ) ) return false;
1365
0
        }
1366
1367
1.90k
        if( checkReadyState )
1368
1.90k
          return true;
1369
1370
0
        ITT_TASKSTART( itt_domain_encode, itt_handle_ccalf_recon );
1371
1372
0
        if( slice.sps->ccalfEnabled )
1373
0
        {
1374
0
          const int firstCtuInRow = ctuRsAddr + 1 - slice.pps->tileColWidth[slice.pps->ctuToTileCol[ctuPosX]];
1375
0
          for( int ctu = firstCtuInRow; ctu <= ctuRsAddr; ctu++ )
1376
0
          {
1377
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cb, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1378
0
            encSlice->m_pALF->applyCcAlfFilterCTU( cs, COMP_Cr, ctu, encSlice->m_ThreadRsrc[ threadIdx ]->m_alfTempCtuBuf );
1379
0
          }
1380
0
        }
1381
1382
0
        ITT_TASKEND( itt_domain_encode, itt_handle_ccalf_recon );
1383
1384
        // extend pic border
1385
        // CCALF reconstruction stage is done per tile, ensure that all tiles in current CTU row are done  
1386
0
        if( ++(pic->m_tileColsDone->at(ctuPosY)) >= pps.numTileCols )
1387
0
        {
1388
0
          PelUnitBuf recoBuf = cs.picture->getRecoBuf();
1389
0
          const int margin = cs.picture->margin;
1390
0
          recoBuf.extendBorderPelLft( y, height, margin );
1391
0
          recoBuf.extendBorderPelRgt( y, height, margin );
1392
0
          if(ctuPosY == 0)
1393
0
            recoBuf.extendBorderPelTop( -margin, pcv.lumaWidth + 2 * margin, margin );
1394
0
          if(ctuPosY + 1 == pcv.heightInCtus)
1395
0
            recoBuf.extendBorderPelBot( -margin, pcv.lumaWidth + 2 * margin, margin );
1396
1397
          // for IFP lines synchro, do an additional increment signaling that CTU row is ready
1398
0
          if( syncLines )
1399
0
            ++(pic->m_tileColsDone->at( ctuPosY ));
1400
0
        }
1401
1402
        // perform finish only once for whole picture
1403
0
        const unsigned finishCtu = pcv.sizeInCtus - 1;
1404
0
        if( ctuRsAddr < finishCtu )
1405
0
        {
1406
0
          processStates[ctuRsAddr] = PROCESS_DONE;
1407
          // processing done => terminate thread
1408
0
          return true;
1409
0
        }
1410
0
        processStates[ctuRsAddr] = FINISH_SLICE;
1411
0
      }
1412
1413
30.9k
    case FINISH_SLICE:
1414
30.9k
      {
1415
30.9k
        CHECK( ctuRsAddr != pcv.sizeInCtus - 1, "invalid state, finish slice only once for last ctu" );
1416
1417
        // ensure all coding tasks have been done for all previous CTUs
1418
64.4k
        for( int i = 0; i < ctuRsAddr; i++ )
1419
63.9k
          if( processStates[ i ] < FINISH_SLICE )
1420
30.4k
            return false;
1421
1422
524
        if( checkReadyState )
1423
524
          return true;
1424
1425
0
        encSlice->finishCompressSlice( cs.picture, slice );
1426
1427
0
        processStates[ ctuRsAddr ] = PROCESS_DONE;
1428
        // processing done => terminate thread
1429
0
        return true;
1430
524
      }
1431
1432
0
    case PROCESS_DONE:
1433
0
      CHECK( true, "process state is PROCESS_DONE, but thread is still running" );
1434
0
      return true;
1435
1436
0
    default:
1437
0
      CHECK( true, "unknown process state" );
1438
0
      return true;
1439
49.4M
  }
1440
1441
0
  return false;
1442
49.4M
}
1443
1444
// Writes the slice data of the given picture with m_CABACWriter.
// Every CTU from startCtuTsAddr up to cs.pcv->sizeInCtus is coded into a local substream; a substream is
// closed at the end of a tile, at the end of a wavefront CTU row (when entropyCodingSyncEnabled) and at the
// end of the slice. The substreams are finally appended, in order, to pic->sliceDataStreams[0].
// Side effects visible here: sets slice->encCABACTableIdx, clears and refills the slice's substream sizes,
// updates m_encCABACTableIdx and m_entropyCodingSyncContextState, and adds the writer's bin count to
// pic->sliceDataNumBins.
void EncSlice::encodeSliceData( Picture* pic )
1445
1.08k
{
1446
1.08k
  CodingStructure& cs              = *pic->cs;
1447
1.08k
  Slice* const slice               = cs.slice;
1448
1.08k
  const uint32_t startCtuTsAddr    = slice->sliceMap.ctuAddrInSlice[0]; // NOTE(review): read as an element of ctuAddrInSlice but used below as an index into it - presumably 0 in practice; confirm
1449
1.08k
  const uint32_t boundingCtuTsAddr = cs.pcv->sizeInCtus;
1450
1.08k
  const bool wavefrontsEnabled     = slice->sps->entropyCodingSyncEnabled;
1451
1452
  // CABAC init table: the slice type itself if pps->cabacInitPresent is not set or slice->pendingRasInit is set, otherwise the index stored by the previous call (m_encCABACTableIdx); this ensures that independently encoded bitstream chunks can be combined to a bit-equal result
1453
1.08k
  const SliceType cabacTableIdx = ! slice->pps->cabacInitPresent || slice->pendingRasInit ? slice->sliceType : m_encCABACTableIdx;
1454
1.08k
  slice->encCABACTableIdx = cabacTableIdx;
1455
1456
  // initialise the context models of the entropy coder for the slice
1457
1.08k
  m_CABACWriter.initCtxModels( *slice );
1458
1459
1.08k
  DTRACE( g_trace_ctx, D_HEADER, "=========== POC: %d ===========\n", slice->poc );
1460
1461
1.08k
  int prevQP[MAX_NUM_CH];
1462
1.08k
  prevQP[0] = prevQP[1] = slice->sliceQp;
1463
1464
1.08k
  const PreCalcValues& pcv        = *cs.pcv;
1465
1.08k
  const uint32_t widthInCtus      = pcv.widthInCtus;
1466
1.08k
  uint32_t uiSubStrm              = 0;
1467
1.08k
  const int numSubstreamsColumns  = slice->pps->numTileCols;
1468
1.08k
  const int numSubstreamRows      = slice->sps->entropyCodingSyncEnabled ? pic->cs->pcv->heightInCtus : slice->pps->numTileRows;
1469
1.08k
  const int numSubstreams         = std::max<int>( numSubstreamRows * numSubstreamsColumns, 0/*(int)pic->brickMap->bricks.size()*/ );
1470
1.08k
  std::vector<OutputBitstream> substreamsOut( numSubstreams );
1471
1472
1.08k
  slice->clearSubstreamSizes();
1473
1474
4.41k
  for( uint32_t ctuTsAddr = startCtuTsAddr; ctuTsAddr < boundingCtuTsAddr; ctuTsAddr++ )
1475
3.33k
  {
1476
3.33k
    const uint32_t ctuRsAddr            = slice->sliceMap.ctuAddrInSlice[ctuTsAddr];
1477
3.33k
    const uint32_t ctuXPosInCtus        = ctuRsAddr % widthInCtus;
1478
3.33k
    const uint32_t ctuYPosInCtus        = ctuRsAddr / widthInCtus;
1479
3.33k
    const uint32_t tileXPosInCtus       = slice->pps->tileColBd[cs.pps->ctuToTileCol[ctuXPosInCtus]];
1480
3.33k
    const uint32_t tileYPosInCtus       = slice->pps->tileRowBd[cs.pps->ctuToTileRow[ctuYPosInCtus]];
1481
1482
3.33k
    DTRACE_UPDATE( g_trace_ctx, std::make_pair( "ctu", ctuRsAddr ) );
1483
1484
3.33k
    const Position pos (ctuXPosInCtus * pcv.maxCUSize, ctuYPosInCtus * pcv.maxCUSize);
1485
3.33k
    const UnitArea ctuArea (cs.area.chromaFormat, Area(pos.x, pos.y, pcv.maxCUSize, pcv.maxCUSize));
1486
3.33k
    CHECK( uiSubStrm >= numSubstreams, "array index out of bounds" );
1487
3.33k
    m_CABACWriter.initBitstream( &substreamsOut[ uiSubStrm ] );
1488
1489
    // set up CABAC contexts' state for this CTU: first CTU of a tile, or (with wavefronts) first CTU of a CTU row within a tile
1490
3.33k
    if (ctuXPosInCtus == tileXPosInCtus && ctuYPosInCtus == tileYPosInCtus )
1491
1.08k
    {
1492
1.08k
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1493
0
      {
1494
0
        m_CABACWriter.initCtxModels( *slice );
1495
0
      }
1496
1.08k
      prevQP[0] = prevQP[1] = slice->sliceQp;
1497
1.08k
    }
1498
2.24k
    else if (ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled)
1499
0
    {
1500
      // Start of a CTU row inside a tile: reset the contexts, then synchronize the CABAC probabilities with the saved state if the position directly above this CTU is available.
1501
0
      if (ctuTsAddr != startCtuTsAddr) // if it is the first CTU, then the entropy coder has already been reset
1502
0
      {
1503
0
        m_CABACWriter.initCtxModels( *slice );
1504
0
      }
1505
0
      if( cs.getCURestricted( pos.offset( 0, -1 ), pos, slice->independentSliceIdx, slice->pps->getTileIdx( ctuXPosInCtus, ctuYPosInCtus ), CH_L, TREE_D ) )
1506
0
      {
1507
        // The CU above is available, so take over the most recently saved sync state (m_entropyCodingSyncContextState).
1508
0
        m_CABACWriter.getCtx() = m_entropyCodingSyncContextState;
1509
0
      }
1510
0
      prevQP[0] = prevQP[1] = slice->sliceQp;
1511
0
    }
1512
1513
3.33k
    m_CABACWriter.coding_tree_unit( cs, ctuArea, prevQP, ctuRsAddr );
1514
1515
    // with wavefronts: after coding the CTU in the first column of the tile, store its probabilities into the sync buffer
1516
3.33k
    if( ctuXPosInCtus == tileXPosInCtus && wavefrontsEnabled )
1517
0
    {
1518
0
      m_entropyCodingSyncContextState = m_CABACWriter.getCtx();
1519
0
    }
1520
1521
    // terminate the sub-stream, if required (end of slice-segment, end of tile, end of wavefront-CTU-row):
1522
3.33k
    bool isMoreCTUsinSlice = ctuTsAddr != (boundingCtuTsAddr - 1);
1523
3.33k
    bool isLastCTUinTile   = isMoreCTUsinSlice && slice->pps->getTileIdx( ctuRsAddr ) != slice->pps->getTileIdx( slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] );
1524
3.33k
    bool isLastCTUinWPP    = wavefrontsEnabled && isMoreCTUsinSlice && !isLastCTUinTile && ( (slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus) == cs.pps->tileColBd[cs.pps->ctuToTileCol[slice->sliceMap.ctuAddrInSlice[ctuTsAddr+1] % widthInCtus]] ); //TODO: adjust tile bound condition
1525
1526
3.33k
    if (isLastCTUinWPP || !isMoreCTUsinSlice || isLastCTUinTile )         // this is the last CTU of either tile/brick/WPP/slice
1527
1.08k
    {
1528
1.08k
      m_CABACWriter.end_of_slice();
1529
1530
      // Byte-alignment in slice_data() when new tile
1531
1.08k
      substreamsOut[ uiSubStrm ].writeByteAlignment();
1532
1533
1.08k
      if (isMoreCTUsinSlice) // the size is recorded only when it is not the last substream in the slice
1534
0
      {
1535
        // record sub-stream size: written bits >> 3 (bytes) plus the counted start code emulations
1536
0
        slice->addSubstreamSize( ( substreamsOut[ uiSubStrm ].getNumberOfWrittenBits() >> 3 ) + substreamsOut[ uiSubStrm ].countStartCodeEmulations() );
1537
0
      }
1538
1.08k
      uiSubStrm++;
1539
1.08k
    }
1540
3.33k
  } // CTU-loop
1541
1542
1.08k
  if(slice->pps->cabacInitPresent) // update m_encCABACTableIdx, which is read at the top of this function on a later call
1543
0
  {
1544
0
    m_encCABACTableIdx = m_CABACWriter.getCtxInitId( *slice );
1545
0
  }
1546
1.08k
  else
1547
1.08k
  {
1548
1.08k
    m_encCABACTableIdx = slice->sliceType;
1549
1.08k
  }
1550
1551
  // concatenate substreams: one per recorded substream size plus the final one, whose size is not recorded
1552
1.08k
  OutputBitstream& outStream = pic->sliceDataStreams[ 0/*slice->sliceIdx*/ ];
1553
2.17k
  for ( int i = 0; i < slice->getNumberOfSubstreamSizes() + 1; i++ )
1554
1.08k
  {
1555
1.08k
    outStream.addSubstream( &(substreamsOut[ i ]) );
1556
1.08k
  }
1557
1.08k
  pic->sliceDataNumBins += m_CABACWriter.getNumBins();
1558
1.08k
}
1559
1560
} // namespace vvenc
1561
1562
//! \}
1563