Coverage Report

Created: 2026-03-08 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/x265/source/encoder/threadedme.cpp
Line
Count
Source
1
/*****************************************************************************
2
 * Copyright (C) 2013-2025 MulticoreWare, Inc
3
 *
4
 * Authors: Shashank Pathipati <shashank.pathipati@multicorewareinc.com>
5
 *          Somu Vineela <somu@multicorewareinc.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 2 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
 *
21
 * This program is also available under a commercial proprietary license.
22
 * For more information, contact us at license @ x265.com.
23
 *****************************************************************************/
24
25
#include "threadedme.h"
26
#include "encoder.h"
27
#include "frameencoder.h"
28
29
#include <iostream>
30
#include <sstream>
31
32
namespace X265_NS {
33
/* Start-index table filled by ThreadedME::initPuStartIdx(): the entry at
 * [pu.width + pu.height][partsize] holds the running PU offset recorded for
 * that PU shape.
 * NOTE(review): a 64x64 PU would give width + height == 128, which is one past
 * the first dimension of this array - confirm against g_puLookup and the
 * extern declaration before relying on that entry. */
int g_puStartIdx[128][8] = {0};
34
35
bool ThreadedME::create()
36
0
{
37
0
    m_active = true;
38
0
    m_tldCount = m_pool->m_numWorkers;
39
0
    m_tld = new ThreadLocalData[m_tldCount];
40
0
    for (int i = 0; i < m_tldCount; i++)
41
0
    {
42
0
        m_tld[i].analysis.initSearch(*m_param, m_enc.m_scalingList);
43
0
        m_tld[i].analysis.create(m_tld);
44
0
    }
45
46
0
    initPuStartIdx();
47
48
    /* start sequence at zero */
49
0
    m_enqueueSeq = 0ULL;
50
51
0
    return true;
52
0
}
53
54
void ThreadedME::initPuStartIdx()
55
0
{
56
0
    int startIdx = 0;
57
0
    uint32_t ctuSize = m_param->maxCUSize;
58
59
0
    for (uint32_t puIdx = 0; puIdx < MAX_NUM_PU_SIZES; ++puIdx)
60
0
    {
61
0
        const PUBlock& pu = g_puLookup[puIdx];
62
63
0
        if (pu.width > ctuSize || pu.height > ctuSize)
64
0
            continue;
65
66
0
        int indexWidth = pu.isAmp ? X265_MAX(pu.width, pu.height) : pu.width;
67
0
        int indexHeight = pu.isAmp ? indexWidth : pu.height;
68
69
0
        int numPUs = (ctuSize / indexWidth) * (ctuSize / indexHeight);
70
0
        int partIdx = static_cast<int>(pu.partsize);
71
72
0
        g_puStartIdx[pu.width + pu.height][partIdx] = startIdx;
73
74
0
        startIdx += pu.isAmp ? 2 * numPUs : numPUs;
75
0
    }
76
0
}
77
78
void ThreadedME::enqueueCTUBlock(int row, int col, int width, int height, int layer, FrameEncoder* frameEnc)
79
0
{
80
0
    frameEnc->m_tmeTasksLock.acquire();
81
82
0
    Frame* frame = frameEnc->m_frame[layer];
83
84
0
    CTUTask task;
85
0
    task.seq = ATOMIC_ADD(&m_enqueueSeq, 1ULL);
86
0
    task.row = row;
87
0
    task.col = col;
88
0
    task.width = width;
89
0
    task.height = height;
90
0
    task.layer = layer;
91
92
0
    task.frame = frame;
93
0
    task.frameEnc = frameEnc;
94
95
0
    frameEnc->m_tmeTasks.push(task);
96
0
    frameEnc->m_tmeTasksLock.release();
97
98
0
    m_taskEvent.trigger();
99
0
}
100
101
void ThreadedME::enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc)
102
0
{
103
0
    int bufRow = X265_MIN(row + m_param->tmeNumBufferRows, static_cast<int>(frameEnc->m_numRows));
104
105
0
    for (int r = 0; r < bufRow; r++)
106
0
    {
107
0
        if (frameEnc->m_tmeDeps[r].isQueued)
108
0
            continue;
109
110
0
        bool isInitialRow = r < m_param->tmeNumBufferRows;
111
0
        bool isExternalDepResolved = frameEnc->m_tmeDeps[r].external;
112
113
0
        int prevRow = X265_MAX(0, r - m_param->tmeNumBufferRows);
114
0
        bool isInternalDepResolved = frameEnc->m_tmeDeps[prevRow].internal;
115
116
0
        if ((isInitialRow && isExternalDepResolved) ||
117
0
            (!isInitialRow && isExternalDepResolved && isInternalDepResolved))
118
0
        {
119
0
            int cols = static_cast<int>(frameEnc->m_numCols);
120
0
            for (int c = 0; c < cols; c += m_param->tmeTaskBlockSize)
121
0
            {
122
0
                int blockWidth = X265_MIN(m_param->tmeTaskBlockSize, cols - c);
123
0
                enqueueCTUBlock(r, c, blockWidth, 1, layer, frameEnc);
124
0
            }
125
0
            frameEnc->m_tmeDeps[r].isQueued = true;
126
0
        }
127
0
    }
128
0
}
129
130
void ThreadedME::threadMain()
131
0
{
132
0
    while (m_active)
133
0
    {
134
0
        int newCTUsPushed = 0;
135
136
0
        for (int i = 0; i < m_param->frameNumThreads; i++)
137
0
        {
138
0
            FrameEncoder* frameEnc = m_enc.m_frameEncoder[i];
139
0
            frameEnc->m_tmeTasksLock.acquire();
140
141
0
            while (!frameEnc->m_tmeTasks.empty())
142
0
            {
143
0
                CTUTask task = frameEnc->m_tmeTasks.front();
144
0
                frameEnc->m_tmeTasks.pop();
145
146
0
                m_taskQueueLock.acquire();
147
0
                m_taskQueue.push(task);
148
0
                m_taskQueueLock.release();
149
150
0
                newCTUsPushed++;
151
0
                tryWakeOne();
152
0
            }
153
154
0
            frameEnc->m_tmeTasksLock.release();
155
0
        }
156
157
0
        if (newCTUsPushed == 0)
158
0
            m_taskEvent.wait();
159
0
    }
160
0
}
161
162
void ThreadedME::findJob(int workerThreadId)
163
0
{
164
0
    m_taskQueueLock.acquire();
165
0
    if (m_taskQueue.empty())
166
0
    {
167
0
        m_helpWanted = false;
168
0
        m_taskQueueLock.release();
169
0
        return;
170
0
    }
171
    
172
0
    m_helpWanted = true;
173
0
    int64_t stime = x265_mdate();
174
175
#ifdef DETAILED_CU_STATS
176
    ScopedElapsedTime tmeTime(m_tld[workerThreadId].analysis.m_stats[m_jpId].tmeTime);
177
    m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
178
#endif
179
180
0
    CTUTask task = m_taskQueue.top();
181
0
    m_taskQueue.pop();
182
0
    m_taskQueueLock.release();
183
184
0
    int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
185
0
    Frame* frame = task.frame;
186
187
0
    for (int i = 0; i < task.height; i++)
188
0
    {
189
0
        for (int j = 0; j < task.width; j++)
190
0
        {
191
192
0
            int ctuAddr = (task.row + i) * numCols + (task.col + j);
193
0
            CUData* ctu = frame->m_encData->getPicCTU(ctuAddr);
194
0
            ctu->m_slice = frame->m_encData->m_slice;
195
196
0
            task.ctu = ctu;
197
0
            task.geom = &task.frameEnc->m_cuGeoms[task.frameEnc->m_ctuGeomMap[ctuAddr]];
198
199
0
            frame->m_encData->m_cuStat[ctuAddr].baseQp = frame->m_encData->m_avgQpRc;
200
0
            initCTU(*ctu, task.row + i, task.col + j, task);
201
202
0
            task.frame->m_ctuMEFlags[ctuAddr].set(0);
203
0
            m_tld[workerThreadId].analysis.deriveMVsForCTU(*task.ctu, *task.geom, *frame);
204
205
0
            task.frame->m_ctuMEFlags[ctuAddr].set(1);
206
0
        }
207
0
    }
208
209
0
    if (m_param->csvLogLevel >= 2)
210
0
    {
211
0
        int64_t etime = x265_mdate();
212
0
        ATOMIC_ADD(&task.frameEnc->m_totalThreadedMETime[task.layer], etime - stime);
213
0
    }
214
215
0
    m_taskEvent.trigger();
216
0
}
217
218
219
void ThreadedME::stopJobs()
220
0
{
221
0
    this->m_active = false;
222
0
    m_taskEvent.trigger();
223
0
}
224
225
void ThreadedME::destroy()
226
0
{
227
0
    for (int i = 0; i < m_tldCount; i++)
228
0
        m_tld[i].destroy();
229
0
    delete[] m_tld;
230
0
}
231
232
/* With DETAILED_CU_STATS defined, folds each worker's analysis statistics for
 * this job provider into m_cuStats; otherwise does nothing. */
void ThreadedME::collectStats()
{
#ifdef DETAILED_CU_STATS
    for (int worker = 0; worker < m_tldCount; ++worker)
    {
        m_cuStats.accumulate(m_tld[worker].analysis.m_stats[m_jpId], *m_param);
    }
#endif
}
239
240
/* Initialises `ctu` for position (row, col) of the task's frame. The slice
 * boundary flags are derived by comparing the sliceId of this CTU row with its
 * neighbouring rows, and the CTU address is numCols * row + col using the
 * frame encoder's column count. */
void initCTU(CUData& ctu, int row, int col, CTUTask& task)
{
    FrameEncoder& frameEnc = *task.frameEnc;
    Frame& frame = *task.frame;

    const int numRows = frameEnc.m_numRows;
    const int numCols = frameEnc.m_numCols;
    CTURow& currentRow = frameEnc.m_rows[row];

    /* the row above is only inspected when there is one */
    uint32_t bFirstRowInSlice = 1;
    if (row != 0 && frameEnc.m_rows[row - 1].sliceId == currentRow.sliceId)
        bFirstRowInSlice = 0;

    /* the row below is only inspected when there is one */
    uint32_t bLastRowInSlice = 1;
    if (row != numRows - 1 && frameEnc.m_rows[row + 1].sliceId == currentRow.sliceId)
        bLastRowInSlice = 0;

    uint32_t bLastCuInSlice = 0;
    if (bLastRowInSlice && col == numCols - 1)
        bLastCuInSlice = 1;

    const int ctuAddr = (numCols * row) + col;
    Slice* slice = frame.m_encData->m_slice;

    ctu.initCTU(frame, ctuAddr, slice->m_sliceQp, bFirstRowInSlice, bLastRowInSlice, bLastCuInSlice);
}
259
260
}