/work/x265/source/encoder/threadedme.cpp
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2025 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Shashank Pathipati <shashank.pathipati@multicorewareinc.com> |
5 | | *          Somu Vineela <somu@multicorewareinc.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #include "threadedme.h" |
26 | | #include "encoder.h" |
27 | | #include "frameencoder.h" |
28 | | |
29 | | #include <iostream> |
30 | | #include <sstream> |
31 | | |
namespace X265_NS {
/* Lookup table filled by ThreadedME::initPuStartIdx(): maps
 * [PU width + PU height][partition size] to the starting PU slot index
 * within a CTU. Zero-initialized; entries for PU shapes larger than the
 * configured CTU size are never written and stay 0. */
int g_puStartIdx[128][8] = {0};
34 | | |
35 | | bool ThreadedME::create() |
36 | 0 | { |
37 | 0 | m_active = true; |
38 | 0 | m_tldCount = m_pool->m_numWorkers; |
39 | 0 | m_tld = new ThreadLocalData[m_tldCount]; |
40 | 0 | for (int i = 0; i < m_tldCount; i++) |
41 | 0 | { |
42 | 0 | m_tld[i].analysis.initSearch(*m_param, m_enc.m_scalingList); |
43 | 0 | m_tld[i].analysis.create(m_tld); |
44 | 0 | } |
45 | |
|
46 | 0 | initPuStartIdx(); |
47 | | |
48 | | /* start sequence at zero */ |
49 | 0 | m_enqueueSeq = 0ULL; |
50 | |
|
51 | 0 | return true; |
52 | 0 | } |
53 | | |
/* Populate g_puStartIdx: for every PU shape in g_puLookup that fits inside
 * the configured CTU, record where its run of PU slots begins. Slots are
 * allocated consecutively in g_puLookup order; AMP shapes reserve twice as
 * many entries (two asymmetric variants per tiling position). */
void ThreadedME::initPuStartIdx()
{
    const uint32_t ctuSize = m_param->maxCUSize;
    int nextStart = 0;

    for (uint32_t lookupIdx = 0; lookupIdx < MAX_NUM_PU_SIZES; ++lookupIdx)
    {
        const PUBlock& pu = g_puLookup[lookupIdx];

        /* PU shapes larger than the CTU can never occur; skip them. */
        if (pu.width > ctuSize || pu.height > ctuSize)
            continue;

        /* AMP partitions tile at the square of the longer side; symmetric
         * partitions tile at their own width x height. */
        int tileW, tileH;
        if (pu.isAmp)
            tileW = tileH = X265_MAX(pu.width, pu.height);
        else
        {
            tileW = pu.width;
            tileH = pu.height;
        }

        const int puPerCtu = (ctuSize / tileW) * (ctuSize / tileH);

        /* NOTE(review): the first key is width+height, so two shapes with the
         * same perimeter share a row and are distinguished only by partsize --
         * confirm g_puLookup has no colliding (sum, partsize) pairs. */
        g_puStartIdx[pu.width + pu.height][static_cast<int>(pu.partsize)] = nextStart;

        nextStart += pu.isAmp ? 2 * puPerCtu : puPerCtu;
    }
}
77 | | |
78 | | void ThreadedME::enqueueCTUBlock(int row, int col, int width, int height, int layer, FrameEncoder* frameEnc) |
79 | 0 | { |
80 | 0 | frameEnc->m_tmeTasksLock.acquire(); |
81 | |
|
82 | 0 | Frame* frame = frameEnc->m_frame[layer]; |
83 | |
|
84 | 0 | CTUTask task; |
85 | 0 | task.seq = ATOMIC_ADD(&m_enqueueSeq, 1ULL); |
86 | 0 | task.row = row; |
87 | 0 | task.col = col; |
88 | 0 | task.width = width; |
89 | 0 | task.height = height; |
90 | 0 | task.layer = layer; |
91 | |
|
92 | 0 | task.frame = frame; |
93 | 0 | task.frameEnc = frameEnc; |
94 | |
|
95 | 0 | frameEnc->m_tmeTasks.push(task); |
96 | 0 | frameEnc->m_tmeTasksLock.release(); |
97 | |
|
98 | 0 | m_taskEvent.trigger(); |
99 | 0 | } |
100 | | |
101 | | void ThreadedME::enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc) |
102 | 0 | { |
103 | 0 | int bufRow = X265_MIN(row + m_param->tmeNumBufferRows, static_cast<int>(frameEnc->m_numRows)); |
104 | |
|
105 | 0 | for (int r = 0; r < bufRow; r++) |
106 | 0 | { |
107 | 0 | if (frameEnc->m_tmeDeps[r].isQueued) |
108 | 0 | continue; |
109 | | |
110 | 0 | bool isInitialRow = r < m_param->tmeNumBufferRows; |
111 | 0 | bool isExternalDepResolved = frameEnc->m_tmeDeps[r].external; |
112 | |
|
113 | 0 | int prevRow = X265_MAX(0, r - m_param->tmeNumBufferRows); |
114 | 0 | bool isInternalDepResolved = frameEnc->m_tmeDeps[prevRow].internal; |
115 | |
|
116 | 0 | if ((isInitialRow && isExternalDepResolved) || |
117 | 0 | (!isInitialRow && isExternalDepResolved && isInternalDepResolved)) |
118 | 0 | { |
119 | 0 | int cols = static_cast<int>(frameEnc->m_numCols); |
120 | 0 | for (int c = 0; c < cols; c += m_param->tmeTaskBlockSize) |
121 | 0 | { |
122 | 0 | int blockWidth = X265_MIN(m_param->tmeTaskBlockSize, cols - c); |
123 | 0 | enqueueCTUBlock(r, c, blockWidth, 1, layer, frameEnc); |
124 | 0 | } |
125 | 0 | frameEnc->m_tmeDeps[r].isQueued = true; |
126 | 0 | } |
127 | 0 | } |
128 | 0 | } |
129 | | |
130 | | void ThreadedME::threadMain() |
131 | 0 | { |
132 | 0 | while (m_active) |
133 | 0 | { |
134 | 0 | int newCTUsPushed = 0; |
135 | |
|
136 | 0 | for (int i = 0; i < m_param->frameNumThreads; i++) |
137 | 0 | { |
138 | 0 | FrameEncoder* frameEnc = m_enc.m_frameEncoder[i]; |
139 | 0 | frameEnc->m_tmeTasksLock.acquire(); |
140 | |
|
141 | 0 | while (!frameEnc->m_tmeTasks.empty()) |
142 | 0 | { |
143 | 0 | CTUTask task = frameEnc->m_tmeTasks.front(); |
144 | 0 | frameEnc->m_tmeTasks.pop(); |
145 | |
|
146 | 0 | m_taskQueueLock.acquire(); |
147 | 0 | m_taskQueue.push(task); |
148 | 0 | m_taskQueueLock.release(); |
149 | |
|
150 | 0 | newCTUsPushed++; |
151 | 0 | tryWakeOne(); |
152 | 0 | } |
153 | |
|
154 | 0 | frameEnc->m_tmeTasksLock.release(); |
155 | 0 | } |
156 | |
|
157 | 0 | if (newCTUsPushed == 0) |
158 | 0 | m_taskEvent.wait(); |
159 | 0 | } |
160 | 0 | } |
161 | | |
/* Worker entry point (invoked by the thread pool): pop the highest-priority
 * CTU block task from the global queue and run motion estimation on every
 * CTU it covers, publishing per-CTU progress flags as it goes.
 *
 * Fix: x265_mdate() was sampled unconditionally even though the elapsed
 * time is only accumulated when csvLogLevel >= 2; the clock is now read
 * only when the per-layer ME timing is actually collected. */
void ThreadedME::findJob(int workerThreadId)
{
    m_taskQueueLock.acquire();
    if (m_taskQueue.empty())
    {
        m_helpWanted = false;
        m_taskQueueLock.release();
        return;
    }

    m_helpWanted = true;

    const bool collectTiming = m_param->csvLogLevel >= 2;
    int64_t stime = collectTiming ? x265_mdate() : 0;

#ifdef DETAILED_CU_STATS
    ScopedElapsedTime tmeTime(m_tld[workerThreadId].analysis.m_stats[m_jpId].tmeTime);
    m_tld[workerThreadId].analysis.m_stats[m_jpId].countTmeTasks++;
#endif

    CTUTask task = m_taskQueue.top();
    m_taskQueue.pop();
    m_taskQueueLock.release();

    /* NOTE(review): CTU addressing is derived from the full-resolution
     * picture width; confirm this matches frameEnc->m_numCols for every
     * layer before relying on it for scaled layers. */
    int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize;
    Frame* frame = task.frame;

    for (int i = 0; i < task.height; i++)
    {
        for (int j = 0; j < task.width; j++)
        {
            int ctuAddr = (task.row + i) * numCols + (task.col + j);
            CUData* ctu = frame->m_encData->getPicCTU(ctuAddr);
            ctu->m_slice = frame->m_encData->m_slice;

            task.ctu = ctu;
            task.geom = &task.frameEnc->m_cuGeoms[task.frameEnc->m_ctuGeomMap[ctuAddr]];

            frame->m_encData->m_cuStat[ctuAddr].baseQp = frame->m_encData->m_avgQpRc;
            initCTU(*ctu, task.row + i, task.col + j, task);

            /* Flag 0 marks ME in progress; flag 1 marks ME complete. */
            task.frame->m_ctuMEFlags[ctuAddr].set(0);
            m_tld[workerThreadId].analysis.deriveMVsForCTU(*task.ctu, *task.geom, *frame);
            task.frame->m_ctuMEFlags[ctuAddr].set(1);
        }
    }

    if (collectTiming)
    {
        int64_t etime = x265_mdate();
        ATOMIC_ADD(&task.frameEnc->m_totalThreadedMETime[task.layer], etime - stime);
    }

    /* Nudge the dispatcher: more staged work may now be movable. */
    m_taskEvent.trigger();
}
217 | | |
218 | | |
219 | | void ThreadedME::stopJobs() |
220 | 0 | { |
221 | 0 | this->m_active = false; |
222 | 0 | m_taskEvent.trigger(); |
223 | 0 | } |
224 | | |
225 | | void ThreadedME::destroy() |
226 | 0 | { |
227 | 0 | for (int i = 0; i < m_tldCount; i++) |
228 | 0 | m_tld[i].destroy(); |
229 | 0 | delete[] m_tld; |
230 | 0 | } |
231 | | |
/* Fold every worker's per-thread CU statistics into the aggregate
 * counters. Compiles to an empty function unless DETAILED_CU_STATS is
 * defined at build time. */
void ThreadedME::collectStats()
{
#ifdef DETAILED_CU_STATS
    for (int i = 0; i < m_tldCount; i++)
        m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
#endif
}
239 | | |
/* Initialize one CTU for threaded ME: derive its slice-boundary flags from
 * the frame encoder's row layout and call CUData::initCTU() with the
 * slice QP. 'row'/'col' are CTU coordinates within the picture. */
void initCTU(CUData& ctu, int row, int col, CTUTask& task)
{
    Frame& frame = *task.frame;
    FrameEncoder& frameEnc = *task.frameEnc;

    Slice* slice = frame.m_encData->m_slice;
    const int numRows = frameEnc.m_numRows;
    const int numCols = frameEnc.m_numCols;
    CTURow& curRow = frameEnc.m_rows[row];

    /* A row begins (ends) a slice when it is the first (last) picture row
     * or the neighbouring row belongs to a different slice. */
    const uint32_t firstRowInSlice =
        (row == 0 || frameEnc.m_rows[row - 1].sliceId != curRow.sliceId) ? 1 : 0;
    const uint32_t lastRowInSlice =
        (row == numRows - 1 || frameEnc.m_rows[row + 1].sliceId != curRow.sliceId) ? 1 : 0;
    const uint32_t lastCuInSlice =
        (lastRowInSlice && col == numCols - 1) ? 1 : 0;

    const int ctuAddr = row * numCols + col;

    ctu.initCTU(frame, ctuAddr, slice->m_sliceQp, firstRowInSlice, lastRowInSlice, lastCuInSlice);
}
259 | | |
260 | | } |