/work/x265/source/encoder/threadedme.h
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * Copyright (C) 2013-2025 MulticoreWare, Inc |
3 | | * |
4 | | * Authors: Shashank Pathipati <shashank.pathipati@multicorewareinc.com> |
5 | | * Somu Vineela <somu@multicorewareinc.com> |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 2 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
20 | | * |
21 | | * This program is also available under a commercial proprietary license. |
22 | | * For more information, contact us at license @ x265.com. |
23 | | *****************************************************************************/ |
24 | | |
25 | | #ifndef THREADED_ME_H |
26 | | #define THREADED_ME_H |
27 | | |
28 | | #include "common.h" |
29 | | #include "threading.h" |
30 | | #include "threadpool.h" |
31 | | #include "cudata.h" |
32 | | #include "lowres.h" |
33 | | #include "frame.h" |
34 | | #include "analysis.h" |
35 | | #include "mv.h" |
36 | | |
37 | | #include <queue> |
38 | | #include <vector> |
39 | | #include <fstream> |
40 | | |
41 | | namespace X265_NS { |
42 | | |
/* Forward declarations of encoder-side classes. */
class Analysis;
class Encoder;
class FrameEncoder;

/* PU start-index lookup table; only declared here, the storage lives elsewhere.
 * NOTE(review): presumably the table filled by ThreadedME::initPuStartIdx();
 * the meaning of the [128] and [8] dimensions is not visible in this header -
 * confirm against the definition. */
extern int g_puStartIdx[128][8];
48 | | |
49 | | struct PUBlock { |
50 | | uint32_t width; |
51 | | uint32_t height; |
52 | | PartSize partsize; |
53 | | bool isAmp; |
54 | | }; |
55 | | |
56 | | const PUBlock g_puLookup[MAX_NUM_PU_SIZES] = { |
57 | | { 8, 4, SIZE_2NxN, 0 }, |
58 | | { 4, 8, SIZE_Nx2N, 0 }, |
59 | | { 8, 8, SIZE_2Nx2N, 0 }, |
60 | | { 16, 4, SIZE_2NxnU, 1 }, |
61 | | { 16, 12, SIZE_2NxnD, 1 }, |
62 | | { 4, 16, SIZE_nLx2N, 1 }, |
63 | | { 12, 16, SIZE_nRx2N, 1 }, |
64 | | { 16, 8, SIZE_2NxN, 0 }, |
65 | | { 8, 16, SIZE_Nx2N, 0 }, |
66 | | { 16, 16, SIZE_2Nx2N, 0 }, |
67 | | { 32, 8, SIZE_2NxnU, 1 }, |
68 | | { 32, 24, SIZE_2NxnD, 1 }, |
69 | | { 8, 32, SIZE_nLx2N, 1 }, |
70 | | { 24, 32, SIZE_nRx2N, 1 }, |
71 | | { 32, 16, SIZE_2NxN, 0 }, |
72 | | { 16, 32, SIZE_Nx2N, 0 }, |
73 | | { 32, 32, SIZE_2Nx2N, 0 }, |
74 | | { 64, 16, SIZE_2NxnU, 1 }, |
75 | | { 64, 48, SIZE_2NxnD, 1 }, |
76 | | { 16, 64, SIZE_nLx2N, 1 }, |
77 | | { 48, 64, SIZE_nRx2N, 1 }, |
78 | | { 64, 32, SIZE_2NxN, 0 }, |
79 | | { 32, 64, SIZE_Nx2N, 0 }, |
80 | | { 64, 64, SIZE_2Nx2N, 0 } |
81 | | }; |
82 | | |
83 | | struct CTUTaskData |
84 | | { |
85 | | CUData& ctuData; |
86 | | CUGeom& ctuGeom; |
87 | | Frame& frame; |
88 | | }; |
89 | | |
90 | | struct CTUBlockTask |
91 | | { |
92 | | int row; |
93 | | int col; |
94 | | int width; |
95 | | int height; |
96 | | Frame* frame; |
97 | | class FrameEncoder* frameEnc; |
98 | | unsigned long long seq; /* monotonic sequence to preserve enqueue order */ |
99 | | }; |
100 | | |
101 | | struct PUData |
102 | | { |
103 | | PartSize part; |
104 | | const CUGeom* cuGeom; |
105 | | int puOffset; |
106 | | int areaId; |
107 | | int finalIdx; |
108 | | int qp; |
109 | | }; |
110 | | |
111 | | struct MEData |
112 | | { |
113 | | MV mv[2]; |
114 | | MV mvp[2]; |
115 | | uint32_t mvCost[2]; |
116 | | int ref[2]; |
117 | | int bits; |
118 | | uint32_t cost; |
119 | | }; |
120 | | |
121 | | struct CTUTask |
122 | | { |
123 | | uint64_t seq; |
124 | | int row; |
125 | | int col; |
126 | | int width; |
127 | | int height; |
128 | | int layer; |
129 | | |
130 | | CUData* ctu; |
131 | | CUGeom* geom; |
132 | | Frame* frame; |
133 | | FrameEncoder* frameEnc; |
134 | | }; |
135 | | |
136 | | |
137 | | struct CompareCTUTask { |
138 | 0 | bool operator()(const CTUTask& a, const CTUTask& b) const { |
139 | 0 | if (a.frame->m_poc == b.frame->m_poc) |
140 | 0 | { |
141 | 0 | int a_pos = a.row + a.col; |
142 | 0 | int b_pos = b.row + b.col; |
143 | 0 | if (a_pos != b_pos) return a_pos > b_pos; |
144 | 0 | } |
145 | | |
146 | | /* Compare by sequence number to preserve FIFO enqueue order. |
147 | | * priority_queue in C++ is a max-heap, so return true when a.seq > b.seq |
148 | | * to make smaller seq (earlier enqueue) the top() element. */ |
149 | 0 | return a.seq > b.seq; |
150 | 0 | } |
151 | | }; |
152 | | |
153 | | /** |
154 | | * @brief Threaded motion-estimation module that schedules CTU blocks across worker threads. |
155 | | * |
156 | | * Owns per-worker analysis state (ThreadLocalData), manages the CTU task queues, |
157 | | * and exposes a JobProvider interface for the thread pool to execute MVP |
158 | | * derivation and ME searches in parallel. |
159 | | */ |
160 | | class ThreadedME: public JobProvider, public Thread |
161 | | { |
162 | | public: |
163 | | x265_param* m_param; |
164 | | Encoder& m_enc; |
165 | | |
166 | | std::priority_queue<CTUTask, std::vector<CTUTask>, CompareCTUTask> m_taskQueue; |
167 | | Lock m_taskQueueLock; |
168 | | Event m_taskEvent; |
169 | | |
170 | | volatile bool m_active; |
171 | | unsigned long long m_enqueueSeq; |
172 | | |
173 | | ThreadLocalData* m_tld; |
174 | | int m_tldCount; |
175 | | |
176 | | #ifdef DETAILED_CU_STATS |
177 | | CUStats m_cuStats; |
178 | | #endif |
179 | | |
180 | | /** |
181 | | * @brief Construct the ThreadedME manager; call create() before use. |
182 | | */ |
183 | 0 | ThreadedME(x265_param* param, Encoder& enc): m_param(param), m_enc(enc) {}; |
184 | | |
185 | | /** |
186 | | * @brief Creates threadpool, thread local data and registers itself as a job provider |
187 | | */ |
188 | | bool create(); |
189 | | |
190 | | /** |
191 | | * @brief Initialize lookup table used to index PU offsets for all valid CTU sizes. |
192 | | */ |
193 | | void initPuStartIdx(); |
194 | | |
195 | | /** |
196 | | * @brief Enqueue a block of CTUs for motion estimation. |
197 | | * |
198 | | * Blocks are queued per FrameEncoder and later moved into the global |
199 | | * priority queue consumed by worker threads. |
200 | | */ |
201 | | void enqueueCTUBlock(int row, int col, int width, int height, int layer, FrameEncoder* frameEnc); |
202 | | |
203 | | /** |
204 | | * @brief Inspect dependency state and enqueue newly-unblocked CTU rows. |
205 | | * |
206 | | * Uses external (row-level) and internal (buffered-row) dependencies to |
207 | | * decide when a row can be split into CTU block tasks. |
208 | | */ |
209 | | void enqueueReadyRows(int row, int layer, FrameEncoder* frameEnc); |
210 | | |
211 | | /** |
212 | | * @brief Main dispatcher thread that transfers per-frame tasks into the global queue. |
213 | | */ |
214 | | void threadMain(); |
215 | | |
216 | | /** |
217 | | * @brief Dequeue a CTU task, derive MVs, and run ME over all supported PU shapes. |
218 | | * |
219 | | * Called by worker threads via JobProvider; processes an entire CTU block. |
220 | | */ |
221 | | void findJob(int workerThreadId); |
222 | | |
223 | | /** |
224 | | * @brief Stops worker threads |
225 | | */ |
226 | | void stopJobs(); |
227 | | |
228 | | /** |
229 | | * @brief Cleanup allocated resources |
230 | | */ |
231 | | void destroy(); |
232 | | |
233 | | /** |
234 | | * @brief Accumulate detailed CU statistics from worker thread local data. |
235 | | */ |
236 | | void collectStats(); |
237 | | }; |
238 | | |
239 | | // Utils |
240 | | |
241 | | /** |
242 | | * @brief A workaround to init CTUs before processRowEncoder does the same, |
243 | | * since the CUData is needed before the FrameEncoder initializes it |
244 | | */ |
245 | | void initCTU(CUData& ctu, int row, int col, CTUTask& task); |
246 | | |
247 | | }; |
248 | | |
249 | | #endif |