Coverage Report

Created: 2022-08-24 06:17

/src/x265/source/encoder/frameencoder.cpp
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * Copyright (C) 2013-2020 MulticoreWare, Inc
3
 *
4
 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5
 *          Min Chen <chenm003@163.com>
6
 *          Steve Borho <steve@borho.org>
7
 *
8
 * This program is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
 *
22
 * This program is also available under a commercial proprietary license.
23
 * For more information, contact us at license @ x265.com.
24
 *****************************************************************************/
25
26
#include "common.h"
27
#include "frame.h"
28
#include "framedata.h"
29
#include "wavefront.h"
30
#include "param.h"
31
32
#include "encoder.h"
33
#include "frameencoder.h"
34
#include "common.h"
35
#include "slicetype.h"
36
#include "nal.h"
37
38
namespace X265_NS {
39
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
40
41
/* Put a newly constructed frame encoder into a known idle state. The buffers
 * referenced here are allocated by other methods of this class; until then
 * every pointer that destroy() releases is held at NULL so that destroy()
 * never frees an indeterminate pointer. */
FrameEncoder::FrameEncoder()
{
    /* timing and thread state */
    m_prevOutputTime = x265_mdate();
    m_slicetypeWaitTime = 0;
    m_reconfigure = false;
    m_isFrameEncoder = true;
    m_threadActive = true;

    /* per-frame row bookkeeping */
    m_activeWorkerCount = 0;
    m_completionCount = 0;
    m_bAllRowsStop = false;
    m_vbvResetTriggerRow = -1;

    /* owned buffers, released by destroy() */
    m_outStreams = NULL;
    m_backupStreams = NULL;
    m_substreamSizes = NULL;
    m_nr = NULL;
    m_tld = NULL;
    m_rows = NULL;
    m_cuGeoms = NULL;
    m_ctuGeomMap = NULL;
    /* these two were previously left uninitialised although destroy() passes
     * them to X265_FREE unconditionally */
    m_sliceBaseRow = NULL;
    m_sliceMaxBlockRow = NULL;

    /* borrowed pointers, assigned by init() / startCompressFrame() */
    m_top = NULL;
    m_param = NULL;
    m_frame = NULL;

    m_localTldIdx = 0;

    /* zeroing also clears m_rce.picTimingSEI and m_rce.hrdTiming */
    memset(&m_rce, 0, sizeof(RateControlEntry));
}
66
67
void FrameEncoder::destroy()
68
3.11k
{
69
3.11k
    if (m_pool)
70
3.11k
    {
71
3.11k
        if (!m_jpId)
72
698
        {
73
698
            int numTLD = m_pool->m_numWorkers;
74
698
            if (!m_param->bEnableWavefront)
75
114
                numTLD += m_pool->m_numProviders;
76
23.3k
            for (int i = 0; i < numTLD; i++)
77
22.6k
                m_tld[i].destroy();
78
698
            delete [] m_tld;
79
698
        }
80
3.11k
    }
81
0
    else
82
0
    {
83
0
        m_tld->destroy();
84
0
        delete m_tld;
85
0
    }
86
87
3.11k
    delete[] m_rows;
88
3.11k
    delete[] m_outStreams;
89
3.11k
    delete[] m_backupStreams;
90
3.11k
    X265_FREE(m_sliceBaseRow);
91
3.11k
    X265_FREE(m_sliceMaxBlockRow);
92
3.11k
    X265_FREE(m_cuGeoms);
93
3.11k
    X265_FREE(m_ctuGeomMap);
94
3.11k
    X265_FREE(m_substreamSizes);
95
3.11k
    X265_FREE(m_nr);
96
97
3.11k
    m_frameFilter.destroy();
98
99
3.11k
    if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
100
0
    {
101
0
        delete m_rce.picTimingSEI;
102
0
        delete m_rce.hrdTiming;
103
0
    }
104
3.11k
}
105
106
/* Bind this frame encoder to its parent encoder and size all per-picture
 * tables.
 *
 * top      parent encoder; its m_param becomes this object's m_param
 * numRows  number of CTU rows in the picture
 * numCols  number of CTU columns in the picture
 *
 * Returns false when numRows is zero or when one of the allocations checked
 * here failed. A failure of WaveFront::init() is only logged and clears
 * m_pool; it does not affect the return value.
 *
 * Fixes relative to the previous version:
 *  - the slice tables are no longer written when their allocation failed
 *  - the uint16_t cast of m_sliceGroupSize now applies to the quotient
 *    instead of the dividend
 *  - CLZ is no longer invoked with a zero operand for a one-CTU picture */
bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
{
    m_top = top;
    m_param = top->m_param;
    m_numRows = numRows;
    m_numCols = numCols;
    m_reconfigure = false;
    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
                        2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
    m_filterRowDelayCus = m_filterRowDelay * numCols;
    m_rows = new CTURow[m_numRows];
    bool ok = !!m_numRows;

    /* first CTU row of every slice, plus one terminating entry */
    m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
    ok &= !!m_sliceBaseRow;
    m_sliceGroupSize = (uint16_t)((m_numRows + m_param->maxSlices - 1) / m_param->maxSlices);
    /* rows per slice in 24.8 fixed point so the remainder is spread evenly */
    uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
    uint32_t rowSum = sliceGroupSizeAccu;
    uint32_t sidx = 0;
    if (m_sliceBaseRow)
    {
        for (uint32_t i = 0; i < m_numRows; i++)
        {
            const uint32_t rowRange = (rowSum >> 8);
            if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
            {
                rowSum += sliceGroupSizeAccu;
                m_sliceBaseRow[++sidx] = i;
            }
        }
        X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
        m_sliceBaseRow[0] = 0;
        m_sliceBaseRow[m_param->maxSlices] = m_numRows;
    }

    /* same partitioning, expressed in rows of 16 luma lines */
    m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
    ok &= !!m_sliceMaxBlockRow;
    uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
    sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
    rowSum = sliceGroupSizeAccu;
    sidx = 0;
    if (m_sliceMaxBlockRow)
    {
        for (uint32_t i = 0; i < maxBlockRows; i++)
        {
            const uint32_t rowRange = (rowSum >> 8);
            if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
            {
                rowSum += sliceGroupSizeAccu;
                m_sliceMaxBlockRow[++sidx] = i;
            }
        }
        m_sliceMaxBlockRow[0] = 0;
        m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
    }

    /* determine full motion search range */
    int range  = m_param->searchRange;       /* fpel search */
    range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
    range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
    range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
    m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + m_param->maxCUSize - 1) / m_param->maxCUSize);

    // NOTE: 2 times of numRows because both Encoder and Filter in same queue
    if (!WaveFront::init(m_numRows * 2))
    {
        x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n");
        m_pool = NULL;
    }

    m_frameFilter.init(top, this, numRows, numCols);

    // initialize HRD parameters of SPS
    if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
    {
        m_rce.picTimingSEI = new SEIPictureTiming;
        m_rce.hrdTiming = new HRDTiming;

        ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
    }

    /* without a NoiseReduction buffer the feature is switched off in m_param */
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
        m_nr = X265_MALLOC(NoiseReduction, 1);
    if (m_nr)
        memset(m_nr, 0, sizeof(NoiseReduction));
    else
        m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;

    // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
    {
        const uint32_t lastCtuAddr = (uint32_t)(numRows * numCols - 1);
        if (lastCtuAddr)
        {
            unsigned long tmp;
            CLZ(tmp, lastCtuAddr);
            m_sliceAddrBits = (uint16_t)(tmp + 1);
        }
        else
        {
            /* one CTU: Ceil(Log2(1)) is 0. Skipping CLZ avoids scanning a
             * zero operand (assumes CLZ wraps a compiler bit-scan primitive,
             * whose result for 0 is not defined - TODO confirm) */
            m_sliceAddrBits = 0;
        }
    }

    return ok;
}
198
199
/* Generate a complete list of unique geom sets for the current picture dimensions */
200
bool FrameEncoder::initializeGeoms()
201
698
{
202
    /* Geoms only vary between CTUs in the presence of picture edges */
203
698
    int maxCUSize = m_param->maxCUSize;
204
698
    int minCUSize = m_param->minCUSize;
205
698
    int heightRem = m_param->sourceHeight & (maxCUSize - 1);
206
698
    int widthRem = m_param->sourceWidth & (maxCUSize - 1);
207
698
    int allocGeoms = 1; // body
208
698
    if (heightRem && widthRem)
209
355
        allocGeoms = 4; // body, right, bottom, corner
210
343
    else if (heightRem || widthRem)
211
226
        allocGeoms = 2; // body, right or bottom
212
213
698
    m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols);
214
698
    m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS);
215
698
    if (!m_cuGeoms || !m_ctuGeomMap)
216
0
        return false;
217
218
    // body
219
698
    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms);
220
698
    memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
221
698
    if (allocGeoms == 1)
222
117
        return true;
223
224
581
    int countGeoms = 1;
225
581
    if (widthRem)
226
471
    {
227
        // right
228
471
        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
229
2.06k
        for (uint32_t i = 0; i < m_numRows; i++)
230
1.59k
        {
231
1.59k
            uint32_t ctuAddr = m_numCols * (i + 1) - 1;
232
1.59k
            m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
233
1.59k
        }
234
471
        countGeoms++;
235
471
    }
236
581
    if (heightRem)
237
465
    {
238
        // bottom
239
465
        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
240
2.14k
        for (uint32_t i = 0; i < m_numCols; i++)
241
1.67k
        {
242
1.67k
            uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
243
1.67k
            m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
244
1.67k
        }
245
465
        countGeoms++;
246
247
465
        if (widthRem)
248
355
        {
249
            // corner
250
355
            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
251
252
355
            uint32_t ctuAddr = m_numCols * m_numRows - 1;
253
355
            m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
254
355
            countGeoms++;
255
355
        }
256
465
        X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
257
465
    }
258
259
581
    return true;
260
698
}
261
262
/* Hand curFrame to this frame encoder and wake its thread through m_enable.
 * Returns false, without waking the thread, when the CU geometry tables
 * could not be created on first use. */
bool FrameEncoder::startCompressFrame(Frame* curFrame)
{
    /* time elapsed since m_prevOutputTime was last recorded */
    m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;

    m_frame = curFrame;
    m_sliceType = curFrame->m_lowres.sliceType;

    /* record on the frame which encoder is working on it */
    curFrame->m_encData->m_frameEncoderID = m_jpId;
    curFrame->m_encData->m_jobProvider = this;
    curFrame->m_encData->m_slice->m_mref = m_mref;

    /* geometry tables are built once, on the first frame */
    if (!m_cuGeoms && !initializeGeoms())
        return false;

    m_enable.trigger();
    return true;
}
280
281
void FrameEncoder::threadMain()
282
3.11k
{
283
3.11k
    THREAD_NAME("Frame", m_jpId);
284
285
3.11k
    if (m_pool)
286
3.11k
    {
287
3.11k
        m_pool->setCurrentThreadAffinity();
288
289
        /* the first FE on each NUMA node is responsible for allocating thread
290
         * local data for all worker threads in that pool. If WPP is disabled, then
291
         * each FE also needs a TLD instance */
292
3.11k
        if (!m_jpId)
293
698
        {
294
698
            int numTLD = m_pool->m_numWorkers;
295
698
            if (!m_param->bEnableWavefront)
296
114
                numTLD += m_pool->m_numProviders;
297
298
698
            m_tld = new ThreadLocalData[numTLD];
299
23.3k
            for (int i = 0; i < numTLD; i++)
300
22.6k
            {
301
22.6k
                m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList);
302
22.6k
                m_tld[i].analysis.create(m_tld);
303
22.6k
            }
304
305
4.51k
            for (int i = 0; i < m_pool->m_numProviders; i++)
306
3.81k
            {
307
3.81k
                if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */
308
3.11k
                {
309
3.11k
                    FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]);
310
3.11k
                    peer->m_tld = m_tld;
311
3.11k
                }
312
3.81k
            }
313
698
        }
314
315
3.11k
        if (m_param->bEnableWavefront)
316
2.92k
            m_localTldIdx = -1; // cause exception if used
317
199
        else
318
199
            m_localTldIdx = m_pool->m_numWorkers + m_jpId;
319
3.11k
    }
320
0
    else
321
0
    {
322
0
        m_tld = new ThreadLocalData;
323
0
        m_tld->analysis.initSearch(*m_param, m_top->m_scalingList);
324
0
        m_tld->analysis.create(NULL);
325
0
        m_localTldIdx = 0;
326
0
    }
327
328
3.11k
    m_done.trigger();     /* signal that thread is initialized */
329
3.11k
    m_enable.wait();      /* Encoder::encode() triggers this event */
330
331
3.81k
    while (m_threadActive)
332
698
    {
333
698
        if (m_param->bCTUInfo)
334
0
        {
335
0
            while (!m_frame->m_ctuInfo)
336
0
                m_frame->m_copied.wait();
337
0
        }
338
698
        if ((m_param->bAnalysisType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
339
0
        {
340
0
            while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
341
0
                m_frame->m_copyMVType.wait();
342
0
        }
343
698
        compressFrame();
344
698
        m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
345
698
        m_enable.wait();
346
698
    }
347
3.11k
}
348
349
void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */)
350
0
{
351
0
    Frame* frame = master.m_frame;
352
0
    weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
353
0
}
354
355
356
uint32_t getBsLength( int32_t code )
357
0
{
358
0
    uint32_t ucode = (code <= 0) ? -code << 1 : (code << 1) - 1;
359
360
0
    ++ucode;
361
0
    unsigned long idx;
362
0
    CLZ( idx, ucode );
363
0
    uint32_t length = (uint32_t)idx * 2 + 1;
364
365
0
    return length;
366
0
}
367
368
/* Decide whether a tone-map SEI payload has to be written for the current
 * frame, and remember the payload in m_top->m_prevTonemapPayload for the
 * next comparison.
 *
 * payload  the candidate SEI payload for this frame
 *
 * Returns true when the payload differs from the remembered one (different
 * size or different bytes), when nothing was remembered yet, or when the
 * current frame is an IDR; false when the payload is an unchanged repeat on
 * a non-IDR frame.
 *
 * The previous version copied into the cache buffer without checking that
 * x265_malloc() succeeded. On allocation failure the cache is now marked
 * empty and the function reports that the SEI must be written.
 *
 * NOTE(review): m_prevTonemapPayload lives in the shared Encoder and no lock
 * is taken here - confirm callers serialise access. */
bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
{
    bool payloadChange = false;
    if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
    {
        /* same size as the cached payload: compare contents */
        if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
            payloadChange = true;
    }
    else
    {
        /* nothing cached or size changed: the cache buffer must be resized */
        payloadChange = true;
        if (m_top->m_prevTonemapPayload.payload != NULL)
            x265_free(m_top->m_prevTonemapPayload.payload);
        m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
        if (m_top->m_prevTonemapPayload.payload == NULL)
        {
            /* leave the cache empty so the next call retries the allocation,
             * and have the caller write this SEI since it cannot be compared */
            if (payload->payloadSize > 0)
                x265_log(m_param, X265_LOG_ERROR, "unable to allocate tone-map SEI cache\n");
            m_top->m_prevTonemapPayload.payloadSize = 0;
            return true;
        }
    }

    if (payloadChange)
    {
        m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
        m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
        memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
    }

    bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
    return (payloadChange || isIDR);
}
394
395
void FrameEncoder::writeTrailingSEIMessages()
396
0
{
397
0
    Slice* slice = m_frame->m_encData->m_slice;
398
0
    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
399
0
    int32_t payloadSize = 0;
400
401
0
    if (m_param->decodedPictureHashSEI == 1)
402
0
    {
403
0
        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
404
0
        for (int i = 0; i < planes; i++)
405
0
            MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
406
0
        payloadSize = 1 + 16 * planes;
407
0
    }
408
0
    else if (m_param->decodedPictureHashSEI == 2)
409
0
    {
410
0
        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
411
0
        for (int i = 0; i < planes; i++)
412
0
            crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
413
0
        payloadSize = 1 + 2 * planes;
414
0
    }
415
0
    else if (m_param->decodedPictureHashSEI == 3)
416
0
    {
417
0
        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
418
0
        for (int i = 0; i < planes; i++)
419
0
            checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
420
0
        payloadSize = 1 + 4 * planes;
421
0
    }
422
423
0
    m_seiReconPictureDigest.setSize(payloadSize);
424
0
    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
425
0
}
426
427
void FrameEncoder::compressFrame()
428
698
{
429
698
    ProfileScopeEvent(frameThread);
430
431
698
    m_startCompressTime = x265_mdate();
432
698
    m_totalActiveWorkerCount = 0;
433
698
    m_activeWorkerCountSamples = 0;
434
698
    m_totalWorkerElapsedTime = 0;
435
698
    m_totalNoWorkerTime = 0;
436
698
    m_countRowBlocks = 0;
437
698
    m_allRowsAvailableTime = 0;
438
698
    m_stallStartTime = 0;
439
440
698
    m_completionCount = 0;
441
698
    m_bAllRowsStop = false;
442
698
    m_vbvResetTriggerRow = -1;
443
698
    m_rowSliceTotalBits[0] = 0;
444
698
    m_rowSliceTotalBits[1] = 0;
445
446
698
    m_SSDY = m_SSDU = m_SSDV = 0;
447
698
    m_ssim = 0;
448
698
    m_ssimCnt = 0;
449
698
    memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
450
451
698
    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
452
0
    {
453
0
        int height = m_frame->m_fencPic->m_picHeight;
454
0
        int width = m_frame->m_fencPic->m_picWidth;
455
0
        intptr_t stride = m_frame->m_fencPic->m_stride;
456
457
0
        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
458
0
        {
459
0
            x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
460
0
        }
461
0
    }
462
463
    /* Emit access unit delimiter unless this is the first frame and the user is
464
     * not repeating headers (since AUD is supposed to be the first NAL in the access
465
     * unit) */
466
698
    Slice* slice = m_frame->m_encData->m_slice;
467
468
698
    if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
469
0
    {
470
0
        m_bs.resetBits();
471
0
        m_entropyCoder.setBitstream(&m_bs);
472
0
        m_entropyCoder.codeAUD(*slice);
473
0
        m_bs.writeByteAlignment();
474
0
        m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
475
0
        if (m_param->bSingleSeiNal)
476
0
            m_bs.resetBits();
477
0
    }
478
698
    if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
479
698
    {
480
698
        if (m_param->bOptRefListLengthPPS)
481
0
        {
482
0
            ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
483
0
            m_top->updateRefIdx();
484
0
        }
485
698
        if (m_top->m_param->rc.bStatRead  && m_top->m_param->bMultiPassOptRPS)
486
0
        {
487
0
            ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
488
0
            if (!m_top->computeSPSRPSIndex())
489
0
            {
490
0
                x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
491
0
                m_top->m_aborted = true;
492
0
            }
493
0
            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
494
0
        }
495
698
        else
496
698
            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
497
698
    }
498
499
698
    if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
500
0
        m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
501
502
    // Weighted Prediction parameters estimation.
503
698
    bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
504
698
    bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
505
506
698
    WeightParam* reuseWP = NULL;
507
698
    if (m_param->analysisLoad && (bUseWeightP || bUseWeightB))
508
0
        reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
509
510
698
    if (bUseWeightP || bUseWeightB)
511
0
    {
512
#if DETAILED_CU_STATS
513
        m_cuStats.countWeightAnalyze++;
514
        ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
515
#endif
516
0
        if (m_param->analysisLoad)
517
0
        {
518
0
            for (int list = 0; list < slice->isInterB() + 1; list++) 
519
0
            {
520
0
                for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
521
0
                {
522
0
                    for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
523
0
                        SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
524
0
                    slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
525
0
                }
526
0
            }
527
0
        }
528
0
        else
529
0
        {
530
0
            WeightAnalysis wa(*this);
531
0
            if (m_pool && wa.tryBondPeers(*this, 1))
532
                /* use an idle worker for weight analysis */
533
0
                wa.waitForExit();
534
0
            else
535
0
                weightAnalyse(*slice, *m_frame, *m_param);
536
0
        }
537
0
    }
538
698
    else
539
698
        slice->disableWeights();
540
541
698
    if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
542
0
        reuseWP = (WeightParam*)m_frame->m_analysisData.wt;
543
    // Generate motion references
544
698
    int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
545
698
    for (int l = 0; l < numPredDir; l++)
546
0
    {
547
0
        for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
548
0
        {
549
0
            WeightParam *w = NULL;
550
0
            if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
551
0
                w = slice->m_weightPredTable[l][ref];
552
0
            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
553
0
            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
554
0
        }
555
0
        if (m_param->analysisSave && (bUseWeightP || bUseWeightB))
556
0
        {
557
0
            for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
558
0
                *(reuseWP++) = slice->m_weightPredTable[l][0][i];
559
0
        }
560
561
0
    }
562
563
698
    int numTLD;
564
698
    if (m_pool)
565
698
        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
566
0
    else
567
0
        numTLD = 1;
568
569
    /* Get the QP for this frame from rate control. This call may block until
570
     * frames ahead of it in encode order have called rateControlEnd() */
571
698
    int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
572
698
    m_rce.newQp = qp;
573
574
698
    if (m_nr)
575
0
    {
576
0
        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
577
0
        {
578
0
            for (int i = 0; i < numTLD; i++)
579
0
            {
580
0
                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
581
0
                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
582
0
                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
583
0
            }
584
0
        }
585
0
        else
586
0
        {
587
0
            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
588
0
            {
589
0
                for (int i = 0; i < numTLD; i++)
590
0
                {
591
0
                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
592
0
                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
593
0
                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
594
0
                }
595
0
            }
596
0
            else
597
0
            {
598
0
                for (int i = 0; i < numTLD; i++)
599
0
                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
600
0
            }
601
0
        }
602
0
    }
603
604
    /* Clip slice QP to 0-51 spec range before encoding */
605
698
    slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
606
698
    if (m_param->bHDR10Opt)
607
0
    {
608
0
        int qpCb = x265_clip3(-12, 0, (int)floor((m_top->m_cB * ((-.46) * qp + 9.26)) + 0.5 ));
609
0
        int qpCr = x265_clip3(-12, 0, (int)floor((m_top->m_cR * ((-.46) * qp + 9.26)) + 0.5 ));
610
0
        slice->m_chromaQpOffset[0] = slice->m_pps->chromaQpOffset[0] + qpCb < -12 ? (qpCb + (-12 - (slice->m_pps->chromaQpOffset[0] + qpCb))) : qpCb;
611
0
        slice->m_chromaQpOffset[1] = slice->m_pps->chromaQpOffset[1] + qpCr < -12 ? (qpCr + (-12 - (slice->m_pps->chromaQpOffset[1] + qpCr))) : qpCr;
612
0
    }
613
614
698
    if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
615
0
    {
616
0
        ScopedLock qpLock(m_top->m_sliceQpLock);
617
0
        for (int i = 0; i < (QP_MAX_MAX + 1); i++)
618
0
        {
619
0
            int delta = slice->m_sliceQp - (i + 1);
620
0
            int codeLength = getBsLength( delta );
621
0
            m_top->m_iBitsCostSum[i] += codeLength;
622
0
        }
623
0
        m_top->m_iFrameNum++;
624
0
    }
625
698
    m_initSliceContext.resetEntropy(*slice);
626
627
698
    m_frameFilter.start(m_frame, m_initSliceContext);
628
629
    /* ensure all rows are blocked prior to initializing row CTU counters */
630
698
    WaveFront::clearEnabledRowMask();
631
632
    /* reset entropy coders and compute slice id */
633
698
    m_entropyCoder.load(m_initSliceContext);
634
1.39k
    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)   
635
3.87k
        for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
636
3.17k
            m_rows[row].init(m_initSliceContext, sliceId);   
637
638
    // reset slice counter for rate control update
639
698
    m_sliceCnt = 0;
640
641
698
    uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
642
698
    X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
643
698
    if (!m_outStreams)
644
698
    {
645
698
        m_outStreams = new Bitstream[numSubstreams];
646
698
        if (!m_param->bEnableWavefront)
647
114
            m_backupStreams = new Bitstream[numSubstreams];
648
698
        m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
649
698
        if (!slice->m_bUseSao)
650
0
        {
651
0
            for (uint32_t i = 0; i < numSubstreams; i++)
652
0
                m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
653
0
        }
654
698
    }
655
0
    else
656
0
    {
657
0
        for (uint32_t i = 0; i < numSubstreams; i++)
658
0
        {
659
0
            m_outStreams[i].resetBits();
660
0
            if (!slice->m_bUseSao)
661
0
                m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
662
0
            else
663
0
                m_rows[i].rowGoOnCoder.setBitstream(NULL);
664
0
        }
665
0
    }
666
667
698
    m_rce.encodeOrder = m_frame->m_encodeOrder;
668
698
    int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
669
670
698
    if (m_frame->m_lowres.bKeyframe)
671
698
    {
672
698
        if (m_param->bEmitHRDSEI)
673
0
        {
674
0
            SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
675
676
            // since the temporal layer HRD is not ready, we assumed it is fixed
677
0
            bpSei->m_auCpbRemovalDelayDelta = 1;
678
0
            bpSei->m_cpbDelayOffset = 0;
679
0
            bpSei->m_dpbDelayOffset = 0;
680
0
            bpSei->m_concatenationFlag = (m_param->bEnableHRDConcatFlag && !m_frame->m_poc) ? true : false;
681
682
            // hrdFullness() calculates the initial CPB removal delay and offset
683
0
            m_top->m_rateControl->hrdFullness(bpSei);
684
0
            bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
685
686
0
            m_top->m_lastBPSEI = m_rce.encodeOrder;
687
0
        }
688
689
698
        if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
690
0
        {
691
            /* Recovery Point SEI require the SPS to be "activated" */
692
0
            SEIRecoveryPoint sei;
693
0
            sei.m_recoveryPocCnt = 0;
694
0
            sei.m_exactMatchingFlag = true;
695
0
            sei.m_brokenLinkFlag = false;
696
0
            sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
697
0
        }
698
698
    }
699
700
698
    if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
701
0
    {
702
0
        SEIPictureTiming *sei = m_rce.picTimingSEI;
703
0
        const VUI *vui = &slice->m_sps->vuiParameters;
704
0
        const HRDInfo *hrd = &vui->hrdParameters;
705
0
        int poc = slice->m_poc;
706
707
0
        if (vui->frameFieldInfoPresentFlag)
708
0
        {
709
0
            if (m_param->interlaceMode > 0)
710
0
            {
711
0
                if( m_param->interlaceMode == 2 )
712
0
                {   
713
                    // m_picStruct should be set to 3 or 4 when field feature is enabled
714
0
                    if (m_param->bField)
715
                        // 3: Top field, bottom field, in that order; 4: Bottom field, top field, in that order
716
0
                        sei->m_picStruct = (slice->m_fieldNum == 1) ? 4 : 3;
717
0
                    else
718
0
                        sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */;
719
0
                }     
720
0
                else if (m_param->interlaceMode == 1)
721
0
                {
722
0
                    if (m_param->bField)
723
0
                        sei->m_picStruct = (slice->m_fieldNum == 1) ? 3: 4;
724
0
                    else
725
0
                        sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
726
0
                }
727
0
            }
728
0
            else if (m_param->bEnableFrameDuplication)
729
0
                sei->m_picStruct = m_frame->m_picStruct;
730
0
            else
731
0
                sei->m_picStruct = m_param->pictureStructure;
732
733
0
            sei->m_sourceScanType = m_param->interlaceMode ? 0 : 1;
734
735
0
            sei->m_duplicateFlag = false;
736
0
        }
737
738
0
        if (vui->hrdParametersPresentFlag)
739
0
        {
740
            // The m_aucpbremoval delay specifies how many clock ticks the
741
            // access unit associated with the picture timing SEI message has to
742
            // wait after removal of the access unit with the most recent
743
            // buffering period SEI message
744
0
            sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - prevBPSEI), (1 << hrd->cpbRemovalDelayLength));
745
0
            sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
746
0
        }
747
748
0
        sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
749
0
    }
750
751
698
    if (m_param->preferredTransferCharacteristics > -1 && slice->isIRAP())
752
0
    {
753
0
        SEIAlternativeTC m_seiAlternativeTC;
754
0
        m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
755
0
        m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
756
0
    }
757
758
    /* Write user SEI */
759
698
    for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
760
0
    {
761
0
        x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
762
0
        if (payload->payloadType == USER_DATA_UNREGISTERED)
763
0
        {
764
0
            SEIuserDataUnregistered sei;
765
0
            sei.m_userData = payload->payload;
766
0
            sei.setSize(payload->payloadSize);
767
0
            sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
768
0
        }
769
0
        else if (payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
770
0
        {
771
0
            bool writeSei = m_param->bDhdr10opt ? writeToneMapInfo(payload) : true;
772
0
            if (writeSei)
773
0
            {
774
0
                SEIuserDataRegistered sei;
775
0
                sei.m_userData = payload->payload;
776
0
                sei.setSize(payload->payloadSize);
777
0
                sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
778
0
            }
779
0
        }
780
0
        else
781
0
            x265_log(m_param, X265_LOG_ERROR, "Unrecognized SEI type\n");
782
0
    }
783
784
698
    bool isSei = ((m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) || m_param->bEmitHRDSEI ||
785
698
                 !!m_param->interlaceMode || (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI) ||
786
698
                   m_frame->m_userSEI.numPayloads);
787
788
698
    if (isSei && m_param->bSingleSeiNal)
789
0
    {
790
0
        m_bs.writeByteAlignment();
791
0
        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
792
0
    }
793
    /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
794
     * tune RateControl parameters for other frames.
795
     * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
796
     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
797
     * and VBV, unlock only after rateControlUpdateStats of this frame is called */
798
698
    if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
799
698
    {
800
698
        m_top->m_rateControl->m_startEndOrder.incr();
801
802
698
        if (m_rce.encodeOrder < m_param->frameNumThreads - 1)
803
645
            m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
804
698
    }
805
806
698
    if (m_param->bDynamicRefine)
807
0
        computeAvgTrainingData();
808
809
    /* Analyze CTU rows, most of the hard work is done here.  Frame is
810
     * compressed in a wave-front pattern if WPP is enabled. Row based loop
811
     * filters runs behind the CTU compression and reconstruction */
812
813
1.39k
    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)    
814
698
        m_rows[m_sliceBaseRow[sliceId]].active = true;
815
    
816
698
    if (m_param->bEnableWavefront)
817
584
    {
818
584
        int i = 0;
819
3.40k
        for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
820
2.82k
        {
821
5.64k
            for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
822
2.82k
            {
823
2.82k
                const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
824
2.82k
                const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
825
2.82k
                const uint32_t row = sliceStartRow + rowInSlice;
826
2.82k
                if (row > sliceEndRow)
827
0
                    continue;
828
2.82k
                m_row_to_idx[row] = i;
829
2.82k
                m_idx_to_row[i] = row;
830
2.82k
                i += 1;
831
2.82k
            }
832
2.82k
        }
833
584
    }
834
835
698
    if (m_param->bEnableWavefront)
836
584
    {
837
3.40k
        for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
838
2.82k
        {
839
5.64k
            for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
840
2.82k
            {
841
2.82k
                const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
842
2.82k
                const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
843
2.82k
                const uint32_t row = sliceStartRow + rowInSlice;
844
845
2.82k
                X265_CHECK(row < m_numRows, "slices row fault was detected");
846
847
2.82k
                if (row > sliceEndRow)
848
0
                    continue;
849
850
                // block until all reference frames have reconstructed the rows we need
851
2.82k
                for (int l = 0; l < numPredDir; l++)
852
0
                {
853
0
                    for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
854
0
                    {
855
0
                        Frame *refpic = slice->m_refFrameList[l][ref];
856
857
                        // NOTE: we unnecessary wait row that beyond current slice boundary
858
0
                        const int rowIdx = X265_MIN(sliceEndRow, (row + m_refLagRows));
859
860
0
                        while (refpic->m_reconRowFlag[rowIdx].get() == 0)
861
0
                            refpic->m_reconRowFlag[rowIdx].waitForChange(0);
862
863
0
                        if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
864
0
                            m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
865
0
                    }
866
0
                }
867
868
2.82k
                enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
869
2.82k
                if (!rowInSlice)
870
584
                {
871
584
                    m_row0WaitTime = x265_mdate();
872
584
                    enqueueRowEncoder(m_row_to_idx[row]); /* clear internal dependency, start wavefront */
873
584
                }
874
2.82k
                tryWakeOne();
875
2.82k
            } // end of loop rowInSlice
876
2.82k
        } // end of loop sliceId
877
878
584
        m_allRowsAvailableTime = x265_mdate();
879
584
        tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */
880
584
        static const int block_ms = 250;
881
584
        while (m_completionEvent.timedWait(block_ms))
882
0
            tryWakeOne();
883
584
    }
884
114
    else
885
114
    {
886
577
        for (uint32_t i = 0; i < m_numRows + m_filterRowDelay; i++)
887
463
        {
888
            // compress
889
463
            if (i < m_numRows)
890
349
            {
891
                // block until all reference frames have reconstructed the rows we need
892
349
                for (int l = 0; l < numPredDir; l++)
893
0
                {
894
0
                    int list = l;
895
0
                    for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
896
0
                    {
897
0
                        Frame *refpic = slice->m_refFrameList[list][ref];
898
899
0
                        const int rowIdx = X265_MIN(m_numRows - 1, (i + m_refLagRows));
900
0
                        while (refpic->m_reconRowFlag[rowIdx].get() == 0)
901
0
                            refpic->m_reconRowFlag[rowIdx].waitForChange(0);
902
903
0
                        if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
904
0
                            m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
905
0
                    }
906
0
                }
907
908
349
                if (!i)
909
114
                    m_row0WaitTime = x265_mdate();
910
235
                else if (i == m_numRows - 1)
911
98
                    m_allRowsAvailableTime = x265_mdate();
912
349
                processRowEncoder(i, m_tld[m_localTldIdx]);
913
349
            }
914
915
            // filter
916
463
            if (i >= m_filterRowDelay)
917
349
                m_frameFilter.processRow(i - m_filterRowDelay);
918
463
        }
919
114
    }
920
#if ENABLE_LIBVMAF
921
    vmafFrameLevelScore();
922
#endif
923
924
698
    if (m_param->maxSlices > 1)
925
0
    {
926
0
        PicYuv *reconPic = m_frame->m_reconPic;
927
0
        uint32_t height = reconPic->m_picHeight;
928
0
        initDecodedPictureHashSEI(0, 0, height);
929
0
    } 
930
931
698
    if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
932
0
        collectDynDataFrame();
933
934
698
    if (m_param->rc.bStatWrite)
935
0
    {
936
0
        int totalI = 0, totalP = 0, totalSkip = 0;
937
938
        // accumulate intra,inter,skip cu count per frame for 2 pass
939
0
        for (uint32_t i = 0; i < m_numRows; i++)
940
0
        {
941
0
            m_frame->m_encData->m_frameStats.mvBits    += m_rows[i].rowStats.mvBits;
942
0
            m_frame->m_encData->m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
943
0
            m_frame->m_encData->m_frameStats.miscBits  += m_rows[i].rowStats.miscBits;
944
0
            totalI                                     += m_rows[i].rowStats.intra8x8Cnt;
945
0
            totalP                                     += m_rows[i].rowStats.inter8x8Cnt;
946
0
            totalSkip                                  += m_rows[i].rowStats.skip8x8Cnt;
947
0
        }
948
0
        int totalCuCount = totalI + totalP + totalSkip;
949
0
        m_frame->m_encData->m_frameStats.percent8x8Intra = (double)totalI / totalCuCount;
950
0
        m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount;
951
0
        m_frame->m_encData->m_frameStats.percent8x8Skip  = (double)totalSkip / totalCuCount;
952
0
    }
953
954
698
    if (m_param->csvLogLevel >= 1)
955
0
    {
956
0
        for (uint32_t i = 0; i < m_numRows; i++)
957
0
        {
958
0
            m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN;
959
0
            m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu;
960
0
            m_frame->m_encData->m_frameStats.totalCtu += m_rows[i].rowStats.totalCtu;
961
0
            m_frame->m_encData->m_frameStats.lumaDistortion += m_rows[i].rowStats.lumaDistortion;
962
0
            m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
963
0
            m_frame->m_encData->m_frameStats.psyEnergy += m_rows[i].rowStats.psyEnergy;
964
0
            m_frame->m_encData->m_frameStats.ssimEnergy += m_rows[i].rowStats.ssimEnergy;
965
0
            m_frame->m_encData->m_frameStats.resEnergy += m_rows[i].rowStats.resEnergy;
966
0
            for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
967
0
            {
968
0
                m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
969
0
                m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth];
970
0
                for (int m = 0; m < INTER_MODES; m++)
971
0
                    m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m];
972
0
                for (int n = 0; n < INTRA_MODES; n++)
973
0
                    m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n];
974
0
            }
975
0
        }
976
0
        m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
977
978
0
        for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
979
0
        {
980
0
            m_frame->m_encData->m_frameStats.percentSkipCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
981
0
            m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu;
982
0
            for (int n = 0; n < INTRA_MODES; n++)
983
0
                m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu;
984
0
            uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts
985
0
            cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2];
986
0
            m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu;
987
0
            m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu;
988
0
            m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu;
989
0
        }
990
0
    }
991
992
698
    if (m_param->csvLogLevel >= 2)
993
0
    {
994
0
        m_frame->m_encData->m_frameStats.avgLumaDistortion = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
995
0
        m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
996
0
        m_frame->m_encData->m_frameStats.avgPsyEnergy = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
997
0
        m_frame->m_encData->m_frameStats.avgSsimEnergy = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
998
0
        m_frame->m_encData->m_frameStats.avgResEnergy = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
999
0
    }
1000
1001
698
    m_bs.resetBits();
1002
698
    m_entropyCoder.load(m_initSliceContext);
1003
698
    m_entropyCoder.setBitstream(&m_bs);
1004
1005
    // finish encode of each CTU row, only required when SAO is enabled
1006
698
    if (slice->m_bUseSao)
1007
698
        encodeSlice(0);
1008
1009
698
    m_entropyCoder.setBitstream(&m_bs);
1010
1011
698
    if (m_param->maxSlices > 1)
1012
0
    {
1013
0
        uint32_t nextSliceRow = 0;
1014
1015
0
        for(uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
1016
0
        {
1017
0
            m_bs.resetBits();
1018
1019
0
            const uint32_t sliceAddr = nextSliceRow * m_numCols;
1020
0
            if (m_param->bOptRefListLengthPPS)
1021
0
            {
1022
0
                ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1023
0
                m_top->analyseRefIdx(slice->m_numRefIdx);
1024
0
            }
1025
0
            m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, sliceAddr, m_sliceAddrBits, slice->m_sliceQp);
1026
1027
            // Find rows of current slice
1028
0
            const uint32_t prevSliceRow = nextSliceRow;
1029
0
            while(nextSliceRow < m_numRows && m_rows[nextSliceRow].sliceId == sliceId)
1030
0
                nextSliceRow++;
1031
1032
            // serialize each row, record final lengths in slice header
1033
0
            uint32_t maxStreamSize = m_nalList.serializeSubstreams(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow), &m_outStreams[prevSliceRow]);
1034
1035
            // complete the slice header by writing WPP row-starts
1036
0
            m_entropyCoder.setBitstream(&m_bs);
1037
0
            if (slice->m_pps->bEntropyCodingSyncEnabled)
1038
0
                m_entropyCoder.codeSliceHeaderWPPEntryPoints(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1), maxStreamSize);
1039
            
1040
0
            m_bs.writeByteAlignment();
1041
1042
0
            m_nalList.serialize(slice->m_nalUnitType, m_bs);
1043
0
        }
1044
0
    }
1045
698
    else
1046
698
    {
1047
698
        if (m_param->bOptRefListLengthPPS)
1048
0
        {
1049
0
            ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
1050
0
            m_top->analyseRefIdx(slice->m_numRefIdx);
1051
0
        }
1052
698
        m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData, 0, 0, slice->m_sliceQp);
1053
1054
        // serialize each row, record final lengths in slice header
1055
698
        uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams);
1056
1057
        // complete the slice header by writing WPP row-starts
1058
698
        m_entropyCoder.setBitstream(&m_bs);
1059
698
        if (slice->m_pps->bEntropyCodingSyncEnabled)
1060
584
            m_entropyCoder.codeSliceHeaderWPPEntryPoints(m_substreamSizes, (slice->m_sps->numCuInHeight - 1), maxStreamSize);
1061
698
        m_bs.writeByteAlignment();
1062
1063
698
        m_nalList.serialize(slice->m_nalUnitType, m_bs);
1064
698
    }
1065
1066
698
    if (m_param->decodedPictureHashSEI)
1067
0
        writeTrailingSEIMessages();
1068
1069
698
    uint64_t bytes = 0;
1070
3.49k
    for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
1071
2.79k
    {
1072
2.79k
        int type = m_nalList.m_nal[i].type;
1073
1074
        // exclude SEI
1075
2.79k
        if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
1076
2.79k
        {
1077
2.79k
            bytes += m_nalList.m_nal[i].sizeBytes;
1078
            // and exclude start code prefix
1079
2.79k
            bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
1080
2.79k
        }
1081
2.79k
    }
1082
698
    m_accessUnitBits = bytes << 3;
1083
1084
698
    int filler = 0;
1085
    /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
1086
698
    if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &filler) < 0)
1087
0
        m_top->m_aborted = true;
1088
1089
698
    if (filler > 0)
1090
0
    {
1091
0
        filler = (filler - FILLER_OVERHEAD * 8) >> 3;
1092
0
        m_bs.resetBits();
1093
0
        while (filler > 0)
1094
0
        {
1095
0
            m_bs.write(0xff, 8);
1096
0
            filler--;
1097
0
        }
1098
0
        m_bs.writeByteAlignment();
1099
0
        m_nalList.serialize(NAL_UNIT_FILLER_DATA, m_bs);
1100
0
        bytes += m_nalList.m_nal[m_nalList.m_numNal - 1].sizeBytes;
1101
0
        bytes -= 3; //exclude start code prefix
1102
0
        m_accessUnitBits = bytes << 3;
1103
0
    }
1104
1105
698
    if (m_frame->m_rpu.payloadSize)
1106
0
    {
1107
0
        m_bs.resetBits();
1108
0
        for (int i = 0; i < m_frame->m_rpu.payloadSize; i++)
1109
0
            m_bs.write(m_frame->m_rpu.payload[i], 8);
1110
0
        m_nalList.serialize(NAL_UNIT_UNSPECIFIED, m_bs);
1111
0
    }
1112
1113
698
    m_endCompressTime = x265_mdate();
1114
1115
    /* Decrement referenced frame reference counts, allow them to be recycled */
1116
698
    for (int l = 0; l < numPredDir; l++)
1117
0
    {
1118
0
        for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
1119
0
        {
1120
0
            Frame *refpic = slice->m_refFrameList[l][ref];
1121
0
            ATOMIC_DEC(&refpic->m_countRefEncoders);
1122
0
        }
1123
0
    }
1124
1125
698
    if (m_nr)
1126
0
    {
1127
0
        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
1128
1129
0
        if (nrEnabled)
1130
0
        {
1131
            /* Accumulate NR statistics from all worker threads */
1132
0
            for (int i = 0; i < numTLD; i++)
1133
0
            {
1134
0
                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1135
0
                for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
1136
0
                {
1137
0
                    for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
1138
0
                        m_nr->nrResidualSum[cat][coeff] += nr->nrResidualSum[cat][coeff];
1139
1140
0
                    m_nr->nrCount[cat] += nr->nrCount[cat];
1141
0
                }
1142
0
            }
1143
1144
0
            noiseReductionUpdate();
1145
1146
            /* Copy updated NR coefficients back to all worker threads */
1147
0
            for (int i = 0; i < numTLD; i++)
1148
0
            {
1149
0
                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
1150
0
                memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise, sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1151
0
                memset(nr->nrCount, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES);
1152
0
                memset(nr->nrResidualSum, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
1153
0
            }
1154
0
        }
1155
0
    }
1156
1157
#if DETAILED_CU_STATS
1158
    /* Accumulate CU statistics from each worker thread, we could report
1159
     * per-frame stats here, but currently we do not. */
1160
    for (int i = 0; i < numTLD; i++)
1161
        m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
1162
#endif
1163
1164
698
    m_endFrameTime = x265_mdate();  
1165
698
}
1166
1167
void FrameEncoder::initDecodedPictureHashSEI(int row, int cuAddr, int height)
1168
3.17k
{
1169
3.17k
    PicYuv *reconPic = m_frame->m_reconPic;
1170
3.17k
    uint32_t width = reconPic->m_picWidth;  
1171
3.17k
    intptr_t stride = reconPic->m_stride;
1172
3.17k
    uint32_t maxCUHeight = m_param->maxCUSize;
1173
1174
3.17k
    const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
1175
3.17k
    const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
1176
1177
3.17k
    if (m_param->decodedPictureHashSEI == 1)
1178
0
    {
1179
0
        if (!row)
1180
0
            MD5Init(&m_seiReconPictureDigest.m_state[0]);
1181
1182
0
        updateMD5Plane(m_seiReconPictureDigest.m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
1183
0
        if (m_param->internalCsp != X265_CSP_I400)
1184
0
        {
1185
0
            if (!row)
1186
0
            {
1187
0
                MD5Init(&m_seiReconPictureDigest.m_state[1]);
1188
0
                MD5Init(&m_seiReconPictureDigest.m_state[2]);
1189
0
            }
1190
1191
0
            width >>= hChromaShift;
1192
0
            height >>= vChromaShift;
1193
0
            stride = reconPic->m_strideC;
1194
1195
0
            updateMD5Plane(m_seiReconPictureDigest.m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
1196
0
            updateMD5Plane(m_seiReconPictureDigest.m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
1197
0
        }
1198
0
    }
1199
3.17k
    else if (m_param->decodedPictureHashSEI == 2)
1200
0
    {
1201
1202
0
        if (!row)
1203
0
            m_seiReconPictureDigest.m_crc[0] = 0xffff;
1204
1205
0
        updateCRC(reconPic->getLumaAddr(cuAddr), m_seiReconPictureDigest.m_crc[0], height, width, stride);
1206
0
        if (m_param->internalCsp != X265_CSP_I400)
1207
0
        {
1208
0
            width >>= hChromaShift;
1209
0
            height >>= vChromaShift;
1210
0
            stride = reconPic->m_strideC;
1211
0
            m_seiReconPictureDigest.m_crc[1] = m_seiReconPictureDigest.m_crc[2] = 0xffff;
1212
1213
0
            updateCRC(reconPic->getCbAddr(cuAddr), m_seiReconPictureDigest.m_crc[1], height, width, stride);
1214
0
            updateCRC(reconPic->getCrAddr(cuAddr), m_seiReconPictureDigest.m_crc[2], height, width, stride);
1215
0
        }
1216
0
    }
1217
3.17k
    else if (m_param->decodedPictureHashSEI == 3)
1218
0
    {
1219
0
        if (!row)
1220
0
            m_seiReconPictureDigest.m_checksum[0] = 0;
1221
1222
0
        updateChecksum(reconPic->m_picOrg[0], m_seiReconPictureDigest.m_checksum[0], height, width, stride, row, maxCUHeight);
1223
0
        if (m_param->internalCsp != X265_CSP_I400)
1224
0
        {
1225
0
            width >>= hChromaShift;
1226
0
            height >>= vChromaShift;
1227
0
            stride = reconPic->m_strideC;
1228
0
            maxCUHeight >>= vChromaShift;
1229
1230
0
            if (!row)
1231
0
                m_seiReconPictureDigest.m_checksum[1] = m_seiReconPictureDigest.m_checksum[2] = 0;
1232
1233
0
            updateChecksum(reconPic->m_picOrg[1], m_seiReconPictureDigest.m_checksum[1], height, width, stride, row, maxCUHeight);
1234
0
            updateChecksum(reconPic->m_picOrg[2], m_seiReconPictureDigest.m_checksum[2], height, width, stride, row, maxCUHeight);
1235
0
        }
1236
0
    }
1237
3.17k
}
1238
1239
void FrameEncoder::encodeSlice(uint32_t sliceAddr)
1240
698
{
1241
698
    Slice* slice = m_frame->m_encData->m_slice;
1242
698
    const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
1243
698
    const uint32_t lastCUAddr = (slice->m_endCUAddr + m_param->num4x4Partitions - 1) / m_param->num4x4Partitions;
1244
698
    const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
1245
1246
698
    SAOParam* saoParam = slice->m_sps->bUseSAO && slice->m_bUseSao ? m_frame->m_encData->m_saoParam : NULL;
1247
14.6k
    for (uint32_t cuAddr = sliceAddr; cuAddr < lastCUAddr; cuAddr++)
1248
13.9k
    {
1249
13.9k
        uint32_t col = cuAddr % widthInLCUs;
1250
13.9k
        uint32_t row = cuAddr / widthInLCUs;
1251
13.9k
        uint32_t subStrm = row % numSubstreams;
1252
13.9k
        CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr);
1253
1254
13.9k
        m_entropyCoder.setBitstream(&m_outStreams[subStrm]);
1255
1256
        // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
1257
13.9k
        if (m_param->bEnableWavefront && !col && row)
1258
2.23k
        {
1259
2.23k
            m_entropyCoder.copyState(m_initSliceContext);
1260
2.23k
            m_entropyCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1261
2.23k
        }
1262
1263
        // Initialize slice context
1264
13.9k
        if (ctu->m_bFirstRowInSlice && !col)
1265
698
            m_entropyCoder.load(m_initSliceContext);
1266
1267
13.9k
        if (saoParam)
1268
13.9k
        {
1269
13.9k
            if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
1270
13.9k
            {
1271
13.9k
                int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT;
1272
13.9k
                int mergeUp = !ctu->m_bFirstRowInSlice && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP;
1273
13.9k
                if (col)
1274
10.8k
                    m_entropyCoder.codeSaoMerge(mergeLeft);
1275
13.9k
                if (!ctu->m_bFirstRowInSlice && !mergeLeft)
1276
4.64k
                    m_entropyCoder.codeSaoMerge(mergeUp);
1277
13.9k
                if (!mergeLeft && !mergeUp)
1278
4.43k
                {
1279
4.43k
                    if (saoParam->bSaoFlag[0])
1280
4.43k
                        m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0);
1281
4.43k
                    if (saoParam->bSaoFlag[1])
1282
4.43k
                    {
1283
4.43k
                        m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1);
1284
4.43k
                        m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2);
1285
4.43k
                    }
1286
4.43k
                }
1287
13.9k
            }
1288
0
            else
1289
0
            {
1290
0
                for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
1291
0
                    saoParam->ctuParam[i][cuAddr].reset();
1292
0
            }
1293
13.9k
        }
1294
1295
        // final coding (bitstream generation) for this CU
1296
13.9k
        m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1297
1298
13.9k
        if (m_param->bEnableWavefront)
1299
13.3k
        {
1300
13.3k
            if (col == 1)
1301
                // Store probabilities of second CTU in line into buffer
1302
2.82k
                m_rows[row].bufferedEntropy.loadContexts(m_entropyCoder);
1303
1304
13.3k
            if (col == widthInLCUs - 1)
1305
2.82k
                m_entropyCoder.finishSlice();
1306
13.3k
        }
1307
13.9k
    }
1308
1309
698
    if (!m_param->bEnableWavefront)
1310
114
        m_entropyCoder.finishSlice();
1311
698
}
1312
1313
void FrameEncoder::processRow(int row, int threadId)
1314
6.55k
{
1315
6.55k
    int64_t startTime = x265_mdate();
1316
6.55k
    if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
1317
1.32k
        m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
1318
1319
6.55k
    const uint32_t realRow = m_idx_to_row[row >> 1];
1320
6.55k
    const uint32_t typeNum = m_idx_to_row[row & 1];
1321
1322
6.55k
    if (!typeNum)
1323
3.73k
        processRowEncoder(realRow, m_tld[threadId]);
1324
2.82k
    else
1325
2.82k
    {
1326
2.82k
        m_frameFilter.processRow(realRow);
1327
1328
        // NOTE: Active next row
1329
2.82k
        if (realRow != m_sliceBaseRow[m_rows[realRow].sliceId + 1] - 1)
1330
2.23k
            enqueueRowFilter(m_row_to_idx[realRow + 1]);
1331
2.82k
    }
1332
1333
6.55k
    if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
1334
1.91k
        m_stallStartTime = x265_mdate();
1335
1336
6.55k
    m_totalWorkerElapsedTime += x265_mdate() - startTime; // not thread safe, but good enough
1337
6.55k
}
1338
1339
// Called by worker threads
1340
/* Encodes the CTUs of one CTU row, starting at column curRow.completed and
 * continuing until the row is finished or has to yield.
 *
 * intRow - frame-level CTU row index (already translated from the job index)
 * tld    - thread-local data of the calling worker; tld.analysis performs the
 *          per-CTU mode decision
 *
 * The function returns early, leaving the row unfinished, when
 *   - WPP is enabled and the row is not active or is already marked busy, or
 *   - after a CTU, a VBV restart is stopping rows below the trigger row, or
 *     the row above is not far enough ahead (see the check at the end of the
 *     CTU loop).
 * Once all columns are done it updates rate-control statistics, finishes the
 * row/slice bitstream when SAO is off, and triggers the row-wise filters. */
void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
1341
4.07k
{
1342
4.07k
    const uint32_t row = (uint32_t)intRow;
1343
4.07k
    CTURow& curRow = m_rows[row];
1344
1345
4.07k
    // WPP: claim the row under its lock; give up if it is inactive or another thread already owns it
    if (m_param->bEnableWavefront)
1346
3.73k
    {
1347
3.73k
        ScopedLock self(curRow.lock);
1348
3.73k
        if (!curRow.active)
1349
            /* VBV restart is in progress, exit out */
1350
0
            return;
1351
3.73k
        if (curRow.busy)
1352
0
        {
1353
            /* On multi-socket Windows servers, we have seen problems with
1354
             * ATOMIC_CAS which resulted in multiple worker threads processing
1355
             * the same CU row, which often resulted in bad pointer accesses. We
1356
             * believe the problem is fixed, but are leaving this check in place
1357
             * to prevent crashes in case it is not */
1358
0
            x265_log(m_param, X265_LOG_WARNING,
1359
0
                     "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
1360
0
            return;
1361
0
        }
1362
3.73k
        curRow.busy = true;
1363
3.73k
    }
1364
1365
    /* When WPP is enabled, every row has its own row coder instance. Otherwise
1366
     * they share row 0 */
1367
4.07k
    Entropy& rowCoder = m_param->bEnableWavefront ? curRow.rowGoOnCoder : m_rows[0].rowGoOnCoder;
1368
4.07k
    FrameData& curEncData = *m_frame->m_encData;
1369
4.07k
    Slice *slice = curEncData.m_slice;
1370
1371
4.07k
    const uint32_t numCols = m_numCols;
1372
4.07k
    // address of the first CTU of this row (CTU addresses are row * numCols + col)
    const uint32_t lineStartCUAddr = row * numCols;
1373
4.07k
    // VBV handling is active only when both a buffer size and a max bitrate are configured
    bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1374
1375
4.07k
    const uint32_t sliceId = curRow.sliceId;
1376
4.07k
    // picture width and CTU size expressed in 16-pixel blocks; used below to index the lowres cost arrays
    uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
1377
4.07k
    uint32_t noOfBlocks = m_param->maxCUSize / 16;
1378
4.07k
    // a row starts/ends a slice when it is the first/last row of the frame or its neighbour belongs to another slice
    const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
1379
4.07k
    const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
1380
4.07k
    const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
1381
4.07k
    const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
1382
1383
    // First row of a slice with no CTU encoded yet: start the row coder from the initial slice context.
1384
4.07k
    if (bFirstRowInSlice && !curRow.completed)        
1385
698
        rowCoder.load(m_initSliceContext);     
1386
1387
    // calculate mean QP for consistent deltaQP signalling calculation
1388
4.07k
    if (m_param->bOptCUDeltaQP)
1389
0
    {
1390
0
        ScopedLock self(curRow.lock);
1391
0
        // computed once per row; avgQPComputed is set below so re-entry skips this block
        if (!curRow.avgQPComputed)
1392
0
        {
1393
0
            // with WPP each row averages its own offsets; without WPP row 0 averages the whole picture and later rows copy it
            if (m_param->bEnableWavefront || !row)
1394
0
            {
1395
0
                double meanQPOff = 0;
1396
0
                bool isReferenced = IS_REFERENCED(m_frame);
1397
0
                double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
1398
0
                if (qpoffs)
1399
0
                {
1400
0
                    // step between stored offsets: 8 when the quantization group size is 8, otherwise 16
                    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
1401
1402
0
                    uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
1403
0
                    if (m_param->bEnableWavefront)
1404
0
                    {
1405
0
                        // restrict the vertical range to the pixel rows covered by this CTU row
                        cuYStart = intRow * m_param->maxCUSize;
1406
0
                        height = cuYStart + m_param->maxCUSize;
1407
0
                    }
1408
1409
0
                    uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
1410
0
                    uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
1411
0
                    uint32_t count = 0;
1412
0
                    // visit every quantization group in range; average the offsets inside each group, then average the groups
                    for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
1413
0
                    {
1414
0
                        for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
1415
0
                        {
1416
0
                            double qp_offset = 0;
1417
0
                            uint32_t cnt = 0;
1418
1419
0
                            for (uint32_t block_yy = cuY; block_yy < cuY + qgSize && block_yy < m_frame->m_fencPic->m_picHeight; block_yy += loopIncr)
1420
0
                            {
1421
0
                                for (uint32_t block_xx = cuX; block_xx < cuX + qgSize && block_xx < width; block_xx += loopIncr)
1422
0
                                {
1423
0
                                    int idx = ((block_yy / loopIncr) * (maxOffsetCols)) + (block_xx / loopIncr);
1424
0
                                    qp_offset += qpoffs[idx];
1425
0
                                    cnt++;
1426
0
                                }
1427
0
                            }
1428
0
                            // cnt >= 1 here: the outer loop conditions guarantee the first block of the group lies inside the picture
                            qp_offset /= cnt;
1429
0
                            meanQPOff += qp_offset;
1430
0
                            count++;
1431
0
                        }
1432
0
                    }
1433
0
                    meanQPOff /= count;
1434
0
                }
1435
0
                rowCoder.m_meanQP = slice->m_sliceQp + meanQPOff;
1436
0
            }
1437
0
            else
1438
0
            {
1439
0
                rowCoder.m_meanQP = m_rows[0].rowGoOnCoder.m_meanQP;
1440
0
            }
1441
0
            curRow.avgQPComputed = 1;
1442
0
        }
1443
0
    }
1444
1445
    // Initialize restrict on MV range in slices
1446
4.07k
    tld.analysis.m_sliceMinY = -(int32_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
1447
4.07k
    tld.analysis.m_sliceMaxY = (int32_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4);
1448
1449
    // Handle single row slice
1450
4.07k
    if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY)
1451
16
        tld.analysis.m_sliceMaxY = tld.analysis.m_sliceMinY = 0;
1452
1453
1454
17.1k
    // main loop: one CTU per iteration, resuming from the column recorded in curRow.completed
    while (curRow.completed < numCols)
1455
13.9k
    {
1456
13.9k
        ProfileScopeEvent(encodeCTU);
1457
1458
13.9k
        const uint32_t col = curRow.completed;
1459
13.9k
        const uint32_t cuAddr = lineStartCUAddr + col;
1460
13.9k
        CUData* ctu = curEncData.getPicCTU(cuAddr);
1461
13.9k
        const uint32_t bLastCuInSlice = (bLastRowInSlice & (col == numCols - 1)) ? 1 : 0;
1462
13.9k
        ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
1463
1464
13.9k
        // VBV: prepare the per-CTU rate-control state (base QP and lowres cost sums) before analysis
        if (bIsVbv)
1465
0
        {
1466
0
            // without WPP, snapshot the output stream and coder state at the start of the row; they are restored below if the row must be re-encoded
            if (col == 0 && !m_param->bEnableWavefront)
1467
0
            {
1468
0
                m_backupStreams[0].copyBits(&m_outStreams[0]);
1469
0
                curRow.bufferedEntropy.copyState(rowCoder);
1470
0
                curRow.bufferedEntropy.loadContexts(rowCoder);
1471
0
            }
1472
0
            // first row of a slice uses the frame average QP, unless this row is the one that triggered a VBV restart
            if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)            
1473
0
            {
1474
0
                curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
1475
0
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
1476
0
            }
1477
1478
0
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
1479
0
            // base QP source: WPP (at or left of the diagonal) -> CTU at cuAddr - numCols + 1 (above-right);
            // no WPP -> QP of the previous row; otherwise -> QP of the current row
            if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1480
0
                cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
1481
0
            else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
1482
0
                cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
1483
0
            else
1484
0
                cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
1485
1486
            /* TODO: use defines from slicetype.h for lowres block size */
1487
0
            uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks;
1488
0
            uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks;
1489
0
            if (!m_param->analysisLoad || !m_param->bDisableLookahead)
1490
0
            {
1491
0
                cuStat.vbvCost = 0;
1492
0
                cuStat.intraVbvCost = 0;
1493
1494
0
                // sum the lowres costs of the blocks covered by this CTU, clipped to the slice's block-row limit and the picture width in blocks
                for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
1495
0
                {
1496
0
                    uint32_t idx = block_x + (block_y * maxBlockCols);
1497
1498
0
                    for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
1499
0
                    {
1500
0
                        cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
1501
0
                        cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
1502
0
                    }
1503
0
                }
1504
0
            }
1505
0
        }
1506
13.9k
        else
1507
13.9k
            curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
1508
1509
13.9k
        if (m_param->bEnableWavefront && !col && !bFirstRowInSlice)
1510
2.23k
        {
1511
            // Load SBAC coder context from previous row and initialize row state.
1512
2.23k
            rowCoder.copyState(m_initSliceContext);
1513
2.23k
            rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
1514
2.23k
        }
1515
13.9k
        // dynamic RD: flag the CTU when the rate-controlled QP exceeds the QP chosen without VBV (compared after truncation to int)
        if (m_param->dynamicRd && (int32_t)(m_rce.qpaRc - m_rce.qpNoVbv) > 0)
1516
0
            ctu->m_vbvAffected = true;
1517
1518
        // Does all the CU analysis, returns best top level mode decision
1519
13.9k
        Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
1520
1521
        /* startPoint > encodeOrder is true when the start point changes for
1522
        a new GOP but few frames from the previous GOP is still incomplete.
1523
        The data of frames in this interval will not be used by any future frames. */
1524
13.9k
        if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
1525
0
            collectDynDataRow(*ctu, &curRow.rowStats);
1526
1527
        // take a sample of the current active worker count
1528
13.9k
        ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
1529
13.9k
        ATOMIC_INC(&m_activeWorkerCountSamples);
1530
1531
        /* advance top-level row coder to include the context of this CTU.
1532
         * if SAO is disabled, rowCoder writes the final CTU bitstream */
1533
13.9k
        rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
1534
1535
13.9k
        if (m_param->bEnableWavefront && col == 1)
1536
            // Save CABAC state for next row
1537
2.82k
            curRow.bufferedEntropy.loadContexts(rowCoder);
1538
1539
        /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
1540
13.9k
        if (slice->m_bUseSao && m_param->bSaoNonDeblocked)
1541
0
            m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
1542
1543
        /* Deblock with idle threading */
1544
13.9k
        if (m_param->bEnableLoopFilter | slice->m_bUseSao)
1545
13.9k
        {
1546
            // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
1547
13.9k
            if (!bIsVbv)
1548
13.9k
            {
1549
                // Delay one row to avoid intra prediction conflict
1550
13.9k
                if (m_pool && !bFirstRowInSlice)
1551
10.8k
                {                    
1552
10.8k
                    int allowCol = col;
1553
1554
                    // avoid race condition on last column
1555
10.8k
                    if (rowInSlice >= 2)
1556
7.82k
                    {
1557
7.82k
                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
1558
7.82k
                                                                  : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
1559
7.82k
                    }
1560
10.8k
                    // publish how far the filter of the row above may advance
                    m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
1561
10.8k
                }
1562
1563
                // Last Row may start early
1564
13.9k
                if (m_pool && bLastRowInSlice)
1565
3.10k
                {
1566
                    // Deblocking last row
1567
3.10k
                    int allowCol = col;
1568
1569
                    // avoid race condition on last column
1570
3.10k
                    if (rowInSlice >= 2)
1571
2.48k
                    {
1572
2.48k
                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
1573
2.48k
                                                                  : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
1574
2.48k
                    }
1575
3.10k
                    m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
1576
3.10k
                }
1577
13.9k
            } // end of !bIsVbv
1578
13.9k
        }
1579
        // Both Loopfilter and SAO Disabled
1580
6
        else
1581
6
        {
1582
6
            m_frameFilter.m_parallelFilter[row].processPostCu(col);
1583
6
        }
1584
1585
        // Completed CU processing
1586
13.9k
        curRow.completed++;
1587
1588
13.9k
        // gather this CTU's statistics into a scratch FrameStats, then fold them into the row totals below
        FrameStats frameLog;
1589
13.9k
        curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
1590
1591
        // copy number of intra, inter cu per row into frame stats for 2 pass
1592
13.9k
        if (m_param->rc.bStatWrite)
1593
0
        {
1594
0
            curRow.rowStats.mvBits    += best.mvBits;
1595
0
            curRow.rowStats.coeffBits += best.coeffBits;
1596
0
            curRow.rowStats.miscBits  += best.totalBits - (best.mvBits + best.coeffBits);
1597
1598
0
            for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1599
0
            {
1600
                /* 1 << shift == number of 8x8 blocks at current depth */
1601
0
                int shift = 2 * (m_param->maxCUDepth - depth);
1602
0
                int cuSize = m_param->maxCUSize >> depth;
1603
1604
0
                curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
1605
0
                                                               (int)(frameLog.cntIntra[depth] << shift);
1606
1607
0
                curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
1608
0
                curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
1609
0
            }
1610
0
        }
1611
13.9k
        curRow.rowStats.totalCtu++;
1612
13.9k
        curRow.rowStats.lumaDistortion   += best.lumaDistortion;
1613
13.9k
        curRow.rowStats.chromaDistortion += best.chromaDistortion;
1614
13.9k
        curRow.rowStats.psyEnergy        += best.psyEnergy;
1615
13.9k
        curRow.rowStats.ssimEnergy       += best.ssimEnergy;
1616
13.9k
        curRow.rowStats.resEnergy        += best.resEnergy;
1617
13.9k
        curRow.rowStats.cntIntraNxN      += frameLog.cntIntraNxN;
1618
13.9k
        curRow.rowStats.totalCu          += frameLog.totalCu;
1619
55.1k
        for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++)
1620
41.1k
        {
1621
41.1k
            curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth];
1622
41.1k
            curRow.rowStats.cntMergeCu[depth] += frameLog.cntMergeCu[depth];
1623
205k
            for (int m = 0; m < INTER_MODES; m++)
1624
164k
                curRow.rowStats.cuInterDistribution[depth][m] += frameLog.cuInterDistribution[depth][m];
1625
164k
            for (int n = 0; n < INTRA_MODES; n++)
1626
123k
                curRow.rowStats.cuIntraDistribution[depth][n] += frameLog.cuIntraDistribution[depth][n];
1627
41.1k
        }
1628
1629
13.9k
        curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
1630
13.9k
        x265_emms();
1631
1632
13.9k
        if (bIsVbv)
1633
0
        {   
1634
            // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
1635
0
            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];    
1636
0
            // with WPP and const-vbv only the first CTU of the slice is accumulated here; the rest is added at the diagonal checkpoints and after the row loop
            if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
1637
0
            {
1638
0
                curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
1639
0
                curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
1640
0
                curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
1641
0
                curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
1642
0
                curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
1643
0
            }
1644
            
1645
            // If current block is at row end checkpoint, call vbv ratecontrol.
1646
0
            if (!m_param->bEnableWavefront && col == numCols - 1)
1647
0
            {
1648
0
                // qpBase is passed by reference-like argument and is read back after the call; it is then clipped to [qpMin, qpMax]
                double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1649
0
                curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1650
0
                qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1651
0
                curEncData.m_rowStat[row].rowQp = qpBase;
1652
0
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1653
0
                // a negative result requests a re-encode of this row: restore the snapshot taken at column 0 and reset the row state
                if (curRow.reEncode < 0)
1654
0
                {
1655
0
                    x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1656
0
                        m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1657
1658
0
                    m_vbvResetTriggerRow = row;
1659
0
                    m_outStreams[0].copyBits(&m_backupStreams[0]);
1660
1661
0
                    rowCoder.copyState(curRow.bufferedEntropy);
1662
0
                    rowCoder.loadContexts(curRow.bufferedEntropy);
1663
1664
0
                    curRow.completed = 0;
1665
0
                    memset(&curRow.rowStats, 0, sizeof(curRow.rowStats));
1666
0
                    curEncData.m_rowStat[row].numEncodedCUs = 0;
1667
0
                    curEncData.m_rowStat[row].encodedBits = 0;
1668
0
                    curEncData.m_rowStat[row].rowSatd = 0;
1669
0
                    curEncData.m_rowStat[row].rowIntraSatd = 0;
1670
0
                    curEncData.m_rowStat[row].sumQpRc = 0;
1671
0
                    curEncData.m_rowStat[row].sumQpAq = 0;
1672
0
                }
1673
0
            }
1674
            // If current block is at row diagonal checkpoint, call vbv ratecontrol.
1675
0
            else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
1676
0
            {
1677
0
                if (m_param->rc.bEnableConstVbv)
1678
0
                {
1679
0
                    // walk upwards from this row to the slice's first row, accumulating the CTUs in [startCuAddr, EndCuAddr] of each row;
                    // the range is moved one row up and one column right after every row
                    uint32_t startCuAddr = numCols * row;
1680
0
                    uint32_t EndCuAddr = startCuAddr + col;
1681
1682
0
                    for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
1683
0
                    {
1684
0
                        for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
1685
0
                        {
1686
0
                            curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1687
0
                            curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1688
0
                            curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1689
0
                            curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1690
0
                            curEncData.m_rowStat[r].numEncodedCUs = c;
1691
0
                        }
1692
0
                        if (curRow.reEncode < 0)
1693
0
                            break;
1694
0
                        startCuAddr = EndCuAddr - numCols;
1695
0
                        EndCuAddr = startCuAddr + 1;
1696
0
                    }
1697
0
                }
1698
0
                double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
1699
0
                curRow.reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
1700
0
                qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
1701
0
                curEncData.m_rowStat[row].rowQp = qpBase;
1702
0
                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
1703
1704
0
                if (curRow.reEncode < 0)
1705
0
                {
1706
0
                    x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
1707
0
                             m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
1708
1709
                    // prevent the WaveFront::findJob() method from providing new jobs
1710
0
                    m_vbvResetTriggerRow = row;
1711
0
                    m_bAllRowsStop = true;
1712
1713
0
                    // stop and reset every row from the last row of the slice up to this one.
                    // r is unsigned: the loop terminates because row >= 1 in this branch (!bFirstRowInSlice)
                    for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
1714
0
                    {
1715
0
                        CTURow& stopRow = m_rows[r];
1716
1717
0
                        if (r != row)
1718
0
                        {
1719
                            /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
1720
0
                            stopRow.lock.acquire();
1721
0
                            while (stopRow.active)
1722
0
                            {
1723
0
                                if (dequeueRow(r * 2))
1724
0
                                    stopRow.active = false;
1725
0
                                else
1726
0
                                {
1727
                                    /* we must release the row lock to allow the thread to exit */
1728
0
                                    stopRow.lock.release();
1729
0
                                    GIVE_UP_TIME();
1730
0
                                    stopRow.lock.acquire();
1731
0
                                }
1732
0
                            }
1733
0
                            stopRow.lock.release();
1734
1735
0
                            // wait until whichever thread owns the row has cleared its busy flag
                            bool bRowBusy = true;
1736
0
                            do
1737
0
                            {
1738
0
                                stopRow.lock.acquire();
1739
0
                                bRowBusy = stopRow.busy;
1740
0
                                stopRow.lock.release();
1741
1742
0
                                if (bRowBusy)
1743
0
                                {
1744
0
                                    GIVE_UP_TIME();
1745
0
                                }
1746
0
                            }
1747
0
                            while (bRowBusy);
1748
0
                        }
1749
1750
0
                        m_outStreams[r].resetBits();
1751
0
                        stopRow.completed = 0;
1752
0
                        memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
1753
0
                        curEncData.m_rowStat[r].numEncodedCUs = 0;
1754
0
                        curEncData.m_rowStat[r].encodedBits = 0;
1755
0
                        curEncData.m_rowStat[r].rowSatd = 0;
1756
0
                        curEncData.m_rowStat[r].rowIntraSatd = 0;
1757
0
                        curEncData.m_rowStat[r].sumQpRc = 0;
1758
0
                        curEncData.m_rowStat[r].sumQpAq = 0;
1759
0
                    }
1760
1761
0
                    m_bAllRowsStop = false;
1762
0
                }
1763
0
            }
1764
0
        }
1765
1766
13.9k
        if (m_param->bEnableWavefront && curRow.completed >= 2 && !bLastRowInSlice &&
1767
13.9k
            (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
1768
8.20k
        {
1769
            /* activate next row */
1770
8.20k
            ScopedLock below(m_rows[row + 1].lock);
1771
1772
8.20k
            // the row below is started only if it is idle and trails this row by at least two CTUs
            if (m_rows[row + 1].active == false &&
1773
8.20k
                m_rows[row + 1].completed + 2 <= curRow.completed)
1774
3.14k
            {
1775
3.14k
                m_rows[row + 1].active = true;
1776
3.14k
                enqueueRowEncoder(m_row_to_idx[row + 1]);
1777
3.14k
                tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
1778
3.14k
            }
1779
8.20k
        }
1780
1781
13.9k
        // Yield check, under the row lock. The row is released (active/busy cleared) when either
        //  - all rows are being stopped and this row lies below the VBV restart trigger row, or
        //  - this is not the first row of the slice, the row above is fewer than two CTUs ahead, and
        //    either this row has not reached its last column or the row above is not finished.
        ScopedLock self(curRow.lock);
1782
13.9k
        if ((m_bAllRowsStop && intRow > m_vbvResetTriggerRow) ||
1783
13.9k
            (!bFirstRowInSlice && ((curRow.completed < numCols - 1) || (m_rows[row - 1].completed < numCols)) && m_rows[row - 1].completed < curRow.completed + 2))
1784
907
        {
1785
907
            curRow.active = false;
1786
907
            curRow.busy = false;
1787
907
            ATOMIC_INC(&m_countRowBlocks);
1788
907
            return;
1789
907
        }
1790
13.9k
    }
1791
1792
    /* this row of CTUs has been compressed */
1793
3.17k
    if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
1794
0
    {
1795
0
        if (bLastRowInSlice)       
1796
0
        {
1797
0
            // last row of the slice: add the CTUs that the diagonal checkpoints have not yet accumulated, for every row of the slice
            for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
1798
0
            {
1799
0
                for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
1800
0
                {
1801
0
                    curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
1802
0
                    curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
1803
0
                    curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits;
1804
0
                    curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp;
1805
0
                    curEncData.m_rowStat[r].numEncodedCUs = c;
1806
0
                }
1807
0
            }
1808
0
        }
1809
0
    }
1810
1811
    /* If encoding with ABR, update bits and complexity in rate control
1812
     * after a number of rows so the next frame's rateControlStart has more
1813
     * accurate data for estimation. At the start of the encode we update stats
1814
     * after half the frame is encoded, but after this initial period we update
1815
     * after refLagRows (the number of rows reference frames must have completed
1816
     * before referencing frames may begin encoding) */
1817
3.17k
    if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
1818
0
    {
1819
0
        uint32_t rowCount = 0;
1820
0
        uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];
1821
1822
0
        // pick the row (within the slice) after which the stats are reported; never beyond the slice's last row
        if (!m_rce.encodeOrder)
1823
0
            rowCount = maxRows - 1; 
1824
0
        else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
1825
0
            rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
1826
0
        else
1827
0
            rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
1828
1829
0
        if (rowInSlice == rowCount)
1830
0
        {
1831
0
            m_rowSliceTotalBits[sliceId] = 0;
1832
0
            // sum the bits of the first rowCount rows of the slice, from row stats or from per-CTU stats
            if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
1833
0
            {
1834
0
                for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
1835
0
                    m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
1836
0
            }
1837
0
            else
1838
0
            {
1839
0
                uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
1840
0
                uint32_t finishAddr = startAddr + rowCount * numCols;
1841
                
1842
0
                for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
1843
0
                    m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
1844
0
            }
1845
1846
0
            // the slice that brings the counter up to maxSlices totals all slices and reports to rate control
            if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
1847
0
            {
1848
0
                m_rce.rowTotalBits = 0;
1849
0
                for (uint32_t i = 0; i < m_param->maxSlices; i++)
1850
0
                    m_rce.rowTotalBits += m_rowSliceTotalBits[i];
1851
0
                m_top->m_rateControl->rateControlUpdateStats(&m_rce);
1852
0
            }
1853
0
        }
1854
0
    }
1855
1856
    /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
1857
    /* end_of_sub_stream_one_bit / end_of_slice_segment_flag */
1858
3.17k
       if (!slice->m_bUseSao && (m_param->bEnableWavefront || bLastRowInSlice))
1859
0
               rowCoder.finishSlice();
1860
1861
1862
    /* Processing left Deblock block with current threading */
1863
3.17k
    if ((m_param->bEnableLoopFilter | slice->m_bUseSao) & (rowInSlice >= 2))
1864
1.79k
    {
1865
        /* Check conditional to start previous row process with current threading */
1866
1.79k
        if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
1867
1.79k
        {
1868
            /* stop threading on current row and restart it */
1869
1.79k
            m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
1870
1.79k
            m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
1871
1.79k
        }
1872
1.79k
    }
1873
1874
    /* trigger row-wise loop filters */
1875
3.17k
    if (m_param->bEnableWavefront)
1876
2.82k
    {
1877
2.82k
        // the filter job of the row m_filterRowDelay rows above becomes runnable once this row is finished
        if (rowInSlice >= m_filterRowDelay)
1878
2.23k
        {
1879
2.23k
            enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1880
1881
            /* NOTE: Activate filter if first row (row 0) */
1882
2.23k
            if (rowInSlice == m_filterRowDelay)
1883
584
                enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
1884
2.23k
            tryWakeOne();
1885
2.23k
        }
1886
1887
2.82k
        if (bLastRowInSlice)
1888
584
        {
1889
1.16k
            // no later row will enable them, so enable the filters of the slice's final m_filterRowDelay rows now
            for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
1890
584
            {
1891
584
                enableRowFilter(m_row_to_idx[i]);
1892
584
            }
1893
584
            tryWakeOne();
1894
584
        }
1895
1896
        // special case - single row slice
1897
2.82k
        if  (bFirstRowInSlice & bLastRowInSlice)
1898
0
        {
1899
0
            enqueueRowFilter(m_row_to_idx[row]);
1900
0
            tryWakeOne();
1901
0
        }
1902
2.82k
    }
1903
1904
3.17k
    // NOTE(review): busy is cleared here without taking curRow.lock, unlike the other accesses in this function
    curRow.busy = false;
1905
1906
    // CHECK_ME: is this condition always false?
1907
3.17k
    if (ATOMIC_INC(&m_completionCount) == 2 * (int)m_numRows)
1908
0
        m_completionEvent.trigger();
1909
3.17k
}
1910
1911
/* Folds the dynamic-refinement samples recorded on one CTU into the running
 * totals of its row.
 *
 * ctu      - CTU whose m_collectCUVariance / m_collectCURd / m_collectCUCount
 *            arrays are read
 * rowStats - row accumulator; rowVarDyn, rowRdDyn and rowCntDyn are updated
 *
 * Slots are addressed as depth * X265_REFINE_INTER_LEVELS + level. A slot is
 * skipped when the CTU recorded no CUs for it. */
void FrameEncoder::collectDynDataRow(CUData& ctu, FrameStats* rowStats)
{
    for (uint32_t level = 0; level < X265_REFINE_INTER_LEVELS; level++)
    {
        for (uint32_t cuDepth = 0; cuDepth < m_param->maxCUDepth; cuDepth++)
        {
            int slot = (cuDepth * X265_REFINE_INTER_LEVELS) + level;

            // nothing was collected for this depth/level pair
            if (!ctu.m_collectCUCount[slot])
                continue;

            rowStats->rowVarDyn[slot] += ctu.m_collectCUVariance[slot];
            rowStats->rowRdDyn[slot] += ctu.m_collectCURd[slot];
            rowStats->rowCntDyn[slot] += ctu.m_collectCUCount[slot];
        }
    }
}
1927
1928
void FrameEncoder::collectDynDataFrame()
1929
0
{
1930
0
    for (uint32_t row = 0; row < m_numRows; row++)
1931
0
    {
1932
0
        for (uint32_t refLevel = 0; refLevel < X265_REFINE_INTER_LEVELS; refLevel++)
1933
0
        {
1934
0
            for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1935
0
            {
1936
0
                int offset = (depth * X265_REFINE_INTER_LEVELS) + refLevel;
1937
0
                int curFrameIndex = m_frame->m_encodeOrder - m_top->m_startPoint;
1938
0
                int index = (curFrameIndex * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1939
0
                if (m_rows[row].rowStats.rowCntDyn[offset])
1940
0
                {
1941
0
                    m_top->m_variance[index] += m_rows[row].rowStats.rowVarDyn[offset];
1942
0
                    m_top->m_rdCost[index] += m_rows[row].rowStats.rowRdDyn[offset];
1943
0
                    m_top->m_trainingCount[index] += m_rows[row].rowStats.rowCntDyn[offset];
1944
0
                }
1945
0
            }
1946
0
        }
1947
0
    }
1948
0
}
1949
1950
void FrameEncoder::computeAvgTrainingData()
1951
0
{
1952
0
    if (m_frame->m_lowres.bScenecut || m_frame->m_lowres.bKeyframe)
1953
0
    {
1954
0
        m_top->m_startPoint = m_frame->m_encodeOrder;
1955
0
        int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1956
0
        memset(m_top->m_variance, 0, size * sizeof(uint64_t));
1957
0
        memset(m_top->m_rdCost, 0, size * sizeof(uint64_t));
1958
0
        memset(m_top->m_trainingCount, 0, size * sizeof(uint32_t));
1959
0
    }
1960
0
    if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads)
1961
0
        m_frame->m_classifyFrame = false;
1962
0
    else
1963
0
        m_frame->m_classifyFrame = true;
1964
1965
0
    int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
1966
0
    memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t));
1967
0
    memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t));
1968
0
    memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t));
1969
0
    if (m_frame->m_classifyFrame)
1970
0
    {
1971
0
        uint32_t limit = m_frame->m_encodeOrder - m_top->m_startPoint - m_param->frameNumThreads;
1972
0
        for (uint32_t i = 1; i < limit; i++)
1973
0
        {
1974
0
            for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1975
0
            {
1976
0
                for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1977
0
                {
1978
0
                    int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1979
0
                    int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
1980
0
                    if (m_top->m_trainingCount[index])
1981
0
                    {
1982
0
                        m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index];
1983
0
                        m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index];
1984
0
                        m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index];
1985
0
                    }
1986
0
                }
1987
0
            }
1988
0
        }
1989
        /* Calculates the average feature values of historic frames that are being considered for the current frame */
1990
0
        int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1;
1991
0
        if (historyCount)
1992
0
        {
1993
0
            for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
1994
0
            {
1995
0
                for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
1996
0
                {
1997
0
                    int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
1998
0
                    m_frame->m_classifyRd[offset] /= historyCount;
1999
0
                    m_frame->m_classifyVariance[offset] /= historyCount;
2000
0
                }
2001
0
            }
2002
0
        }
2003
0
    }
2004
0
}
2005
2006
/* collect statistics about CU coding decisions, return total QP */
2007
int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
2008
13.9k
{
2009
13.9k
    int totQP = 0;
2010
13.9k
    uint32_t depth = 0;
2011
79.8k
    for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2012
65.8k
    {
2013
65.8k
        depth = ctu.m_cuDepth[absPartIdx];
2014
65.8k
        totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2));
2015
65.8k
    }
2016
2017
13.9k
    if (m_param->csvLogLevel >= 1 || m_param->rc.bStatWrite)
2018
0
    {
2019
0
        if (ctu.m_slice->m_sliceType == I_SLICE)
2020
0
        {
2021
0
            depth = 0;
2022
0
            for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2023
0
            {
2024
0
                depth = ctu.m_cuDepth[absPartIdx];
2025
2026
0
                log->totalCu++;
2027
0
                log->cntIntra[depth]++;
2028
2029
0
                if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2030
0
                {
2031
0
                    log->totalCu--;
2032
0
                    log->cntIntra[depth]--;
2033
0
                }
2034
0
                else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2035
0
                {
2036
                    /* TODO: log intra modes at absPartIdx +0 to +3 */
2037
0
                    X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2038
0
                    log->cntIntraNxN++;
2039
0
                    log->cntIntra[depth]--;
2040
0
                }
2041
0
                else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2042
0
                    log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2043
0
                else
2044
0
                    log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2045
0
            }
2046
0
        }
2047
0
        else
2048
0
        {
2049
0
            depth = 0;
2050
0
            for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
2051
0
            {
2052
0
                depth = ctu.m_cuDepth[absPartIdx];
2053
2054
0
                log->totalCu++;
2055
2056
0
                if (ctu.m_predMode[absPartIdx] == MODE_NONE)
2057
0
                    log->totalCu--;
2058
0
                else if (ctu.isSkipped(absPartIdx))
2059
0
                {
2060
0
                    if (ctu.m_mergeFlag[0])
2061
0
                        log->cntMergeCu[depth]++;
2062
0
                    else
2063
0
                        log->cntSkipCu[depth]++;
2064
0
                }
2065
0
                else if (ctu.isInter(absPartIdx))
2066
0
                {
2067
0
                    log->cntInter[depth]++;
2068
2069
0
                    if (ctu.m_partSize[absPartIdx] < AMP_ID)
2070
0
                        log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
2071
0
                    else
2072
0
                        log->cuInterDistribution[depth][AMP_ID]++;
2073
0
                }
2074
0
                else if (ctu.isIntra(absPartIdx))
2075
0
                {
2076
0
                    log->cntIntra[depth]++;
2077
2078
0
                    if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
2079
0
                    {
2080
0
                        X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
2081
0
                        log->cntIntraNxN++;
2082
0
                        log->cntIntra[depth]--;
2083
                        /* TODO: log intra modes at absPartIdx +0 to +3 */
2084
0
                    }
2085
0
                    else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
2086
0
                        log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
2087
0
                    else
2088
0
                        log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
2089
0
                }
2090
0
            }
2091
0
        }
2092
0
    }
2093
2094
13.9k
    return totQP;
2095
13.9k
}
2096
2097
/* DCT-domain noise reduction / adaptive deadzone from libavcodec */
2098
void FrameEncoder::noiseReductionUpdate()
2099
0
{
2100
0
    static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
2101
2102
0
    for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
2103
0
    {
2104
0
        int trSize = cat & 3;
2105
0
        int coefCount = 1 << ((trSize + 2) * 2);
2106
2107
0
        if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
2108
0
        {
2109
0
            for (int i = 0; i < coefCount; i++)
2110
0
                m_nr->nrResidualSum[cat][i] >>= 1;
2111
0
            m_nr->nrCount[cat] >>= 1;
2112
0
        }
2113
2114
0
        int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
2115
0
        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->nrCount[cat];
2116
2117
0
        for (int i = 0; i < coefCount; i++)
2118
0
        {
2119
0
            uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i] / 2;
2120
0
            uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
2121
0
            m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
2122
0
        }
2123
2124
        // Don't denoise DC coefficients
2125
0
        m_nr->nrOffsetDenoise[cat][0] = 0;
2126
0
    }
2127
0
}
2128
#if ENABLE_LIBVMAF
2129
void FrameEncoder::vmafFrameLevelScore()
2130
{
2131
    PicYuv *fenc = m_frame->m_fencPic;
2132
    PicYuv *recon = m_frame->m_reconPic;
2133
2134
    x265_vmaf_framedata *vmafframedata = (x265_vmaf_framedata*)x265_malloc(sizeof(x265_vmaf_framedata));
2135
    if (!vmafframedata)
2136
    {
2137
        x265_log(NULL, X265_LOG_ERROR, "vmaf frame data alloc failed\n");
2138
    }
2139
2140
    vmafframedata->height = fenc->m_picHeight;
2141
    vmafframedata->width = fenc->m_picWidth;
2142
    vmafframedata->frame_set = 0;
2143
    vmafframedata->internalBitDepth = m_param->internalBitDepth;
2144
    vmafframedata->reference_frame = fenc;
2145
    vmafframedata->distorted_frame = recon;
2146
2147
    fenc->m_vmafScore = x265_calculate_vmaf_framelevelscore(vmafframedata);
2148
2149
    if (vmafframedata)
2150
    x265_free(vmafframedata);
2151
}
2152
#endif
2153
2154
/* Hand the frame owned by this encoder back to the caller.
 *
 *   output - receives the contents of m_nalList
 *
 * Returns NULL immediately when no frame is assigned (m_frame is NULL).
 * Otherwise waits on m_done, clears m_frame, records the output time in
 * m_prevOutputTime and returns the frame. */
Frame *FrameEncoder::getEncodedPicture(NALList& output)
{
    if (!m_frame)
        return NULL;

    /* block here until worker thread completes */
    m_done.wait();

    Frame *encoded = m_frame;
    m_frame = NULL;
    output.takeContents(m_nalList);
    m_prevOutputTime = x265_mdate();
    return encoded;
}
2170
}