/src/ffmpeg/libavcodec/vvc/thread.c
Line | Count | Source |
1 | | /* |
2 | | * VVC thread logic |
3 | | * |
4 | | * Copyright (C) 2023 Nuo Mi |
5 | | * |
6 | | * This file is part of FFmpeg. |
7 | | * |
8 | | * FFmpeg is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * FFmpeg is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public |
19 | | * License along with FFmpeg; if not, write to the Free Software |
20 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 | | */ |
22 | | |
23 | | #include <stdatomic.h> |
24 | | |
25 | | #include "libavcodec/executor.h" |
26 | | #include "libavutil/mem.h" |
27 | | #include "libavutil/thread.h" |
28 | | |
29 | | #include "thread.h" |
30 | | #include "ctu.h" |
31 | | #include "filter.h" |
32 | | #include "inter.h" |
33 | | #include "intra.h" |
34 | | #include "refs.h" |
35 | | |
36 | | typedef struct ProgressListener { |
37 | | VVCProgressListener l; |
38 | | struct VVCTask *task; |
39 | | VVCContext *s; |
40 | | } ProgressListener; |
41 | | |
/**
 * Stages a CTU task walks through, in execution order.
 * The numeric values index the score array of VVCTask and the per-stage
 * tables in this file, so the order must not change.
 */
typedef enum VVCTaskStage {
    VVC_TASK_STAGE_INIT = 0,    // for CTU(0, 0) only
    VVC_TASK_STAGE_PARSE,
    VVC_TASK_STAGE_DEBLOCK_BS,
    VVC_TASK_STAGE_INTER,
    VVC_TASK_STAGE_RECON,
    VVC_TASK_STAGE_LMCS,
    VVC_TASK_STAGE_DEBLOCK_V,
    VVC_TASK_STAGE_DEBLOCK_H,
    VVC_TASK_STAGE_SAO,
    VVC_TASK_STAGE_ALF,
    VVC_TASK_STAGE_LAST         // number of stages, marks a finished task
} VVCTaskStage;
55 | | |
56 | | typedef struct VVCTask { |
57 | | union { |
58 | | struct VVCTask *next; //for executor debug only |
59 | | FFTask task; |
60 | | } u; |
61 | | |
62 | | VVCTaskStage stage; |
63 | | |
64 | | // ctu x, y, and raster scan order |
65 | | int rx, ry, rs; |
66 | | VVCFrameContext *fc; |
67 | | |
68 | | ProgressListener col_listener; |
69 | | ProgressListener listener[2][VVC_MAX_REF_ENTRIES]; |
70 | | |
71 | | // for parse task only |
72 | | SliceContext *sc; |
73 | | EntryPoint *ep; |
74 | | int ctu_idx; //ctu idx in the current slice |
75 | | |
76 | | // tasks with target scores met are ready for scheduling |
77 | | atomic_uchar score[VVC_TASK_STAGE_LAST]; |
78 | | atomic_uchar target_inter_score; |
79 | | } VVCTask; |
80 | | |
81 | | typedef struct VVCRowThread { |
82 | | atomic_int col_progress[VVC_PROGRESS_LAST]; |
83 | | } VVCRowThread; |
84 | | |
85 | | typedef struct VVCFrameThread { |
86 | | // error return for tasks |
87 | | atomic_int ret; |
88 | | |
89 | | VVCRowThread *rows; |
90 | | VVCTask *tasks; |
91 | | |
92 | | int ctu_size; |
93 | | int ctu_width; |
94 | | int ctu_height; |
95 | | int ctu_count; |
96 | | |
97 | | //protected by lock |
98 | | atomic_int nb_scheduled_tasks; |
99 | | atomic_int nb_scheduled_listeners; |
100 | | |
101 | | int row_progress[VVC_PROGRESS_LAST]; |
102 | | |
103 | | AVMutex lock; |
104 | | AVCond cond; |
105 | | } VVCFrameThread; |
106 | | |
107 | 1.71M | #define PRIORITY_LOWEST 2 |
108 | | static void add_task(VVCContext *s, VVCTask *t) |
109 | 1.70M | { |
110 | 1.70M | VVCFrameThread *ft = t->fc->ft; |
111 | 1.70M | FFTask *task = &t->u.task; |
112 | 1.70M | const int priorities[] = { |
113 | 1.70M | 0, // VVC_TASK_STAGE_INIT, |
114 | 1.70M | 0, // VVC_TASK_STAGE_PARSE, |
115 | 1.70M | 1, // VVC_TASK_STAGE_DEBLOCK_BS |
116 | | // For an 8K clip, a CTU line completed in the reference frame may trigger 64 and more inter tasks. |
117 | | // We assign these tasks the lowest priority to avoid being overwhelmed with inter tasks. |
118 | 1.70M | PRIORITY_LOWEST, // VVC_TASK_STAGE_INTER |
119 | 1.70M | 1, // VVC_TASK_STAGE_RECON, |
120 | 1.70M | 1, // VVC_TASK_STAGE_LMCS, |
121 | 1.70M | 1, // VVC_TASK_STAGE_DEBLOCK_V, |
122 | 1.70M | 1, // VVC_TASK_STAGE_DEBLOCK_H, |
123 | 1.70M | 1, // VVC_TASK_STAGE_SAO, |
124 | 1.70M | 1, // VVC_TASK_STAGE_ALF, |
125 | 1.70M | }; |
126 | | |
127 | 1.70M | atomic_fetch_add(&ft->nb_scheduled_tasks, 1); |
128 | 1.70M | task->priority = priorities[t->stage]; |
129 | 1.70M | ff_executor_execute(s->executor, task); |
130 | 1.70M | } |
131 | | |
132 | | static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry) |
133 | 3.25M | { |
134 | 3.25M | memset(t, 0, sizeof(*t)); |
135 | 3.25M | t->stage = stage; |
136 | 3.25M | t->fc = fc; |
137 | 3.25M | t->rx = rx; |
138 | 3.25M | t->ry = ry; |
139 | 3.25M | t->rs = ry * fc->ft->ctu_width + rx; |
140 | 35.7M | for (int i = 0; i < FF_ARRAY_ELEMS(t->score); i++) |
141 | 32.5M | atomic_store(t->score + i, 0); |
142 | 3.25M | atomic_store(&t->target_inter_score, 0); |
143 | 3.25M | } |
144 | | |
145 | | static int task_init_parse(VVCTask *t, SliceContext *sc, EntryPoint *ep, const int ctu_idx) |
146 | 1.66M | { |
147 | 1.66M | if (t->sc) { |
148 | | // the task already inited, error bitstream |
149 | 1.15k | return AVERROR_INVALIDDATA; |
150 | 1.15k | } |
151 | 1.66M | t->sc = sc; |
152 | 1.66M | t->ep = ep; |
153 | 1.66M | t->ctu_idx = ctu_idx; |
154 | | |
155 | 1.66M | return 0; |
156 | 1.66M | } |
157 | | |
158 | | static uint8_t task_add_score(VVCTask *t, const VVCTaskStage stage) |
159 | 41.7M | { |
160 | 41.7M | return atomic_fetch_add(&t->score[stage], 1) + 1; |
161 | 41.7M | } |
162 | | |
163 | | static uint8_t task_get_score(VVCTask *t, const VVCTaskStage stage) |
164 | 14.2M | { |
165 | 14.2M | return atomic_load(&t->score[stage]); |
166 | 14.2M | } |
167 | | |
168 | | //first row in tile or slice |
169 | | static int is_first_row(const VVCFrameContext *fc, const int rx, const int ry) |
170 | 1.56M | { |
171 | 1.56M | const VVCFrameThread *ft = fc->ft; |
172 | 1.56M | const VVCPPS *pps = fc->ps.pps; |
173 | | |
174 | 1.56M | if (ry != pps->ctb_to_row_bd[ry]) { |
175 | 0 | const int rs = ry * ft->ctu_width + rx; |
176 | 0 | return fc->tab.slice_idx[rs] != fc->tab.slice_idx[rs - ft->ctu_width]; |
177 | 0 | } |
178 | 1.56M | return 1; |
179 | 1.56M | } |
180 | | |
181 | | static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uint8_t score) |
182 | 55.9M | { |
183 | | // l:left, r:right, t: top, b: bottom |
184 | 55.9M | static const uint8_t target_score[] = |
185 | 55.9M | { |
186 | 55.9M | 2, //VVC_TASK_STAGE_DEBLOCK_BS,need l + t parse |
187 | 55.9M | 0, //VVC_TASK_STAGE_INTER, not used |
188 | 55.9M | 2, //VVC_TASK_STAGE_RECON, need l + rt recon |
189 | 55.9M | 3, //VVC_TASK_STAGE_LMCS, need r + b + rb recon |
190 | 55.9M | 1, //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v |
191 | 55.9M | 2, //VVC_TASK_STAGE_DEBLOCK_H, need r deblock v + t deblock h |
192 | 55.9M | 5, //VVC_TASK_STAGE_SAO, need l + r + lb + b + rb deblock h |
193 | 55.9M | 8, //VVC_TASK_STAGE_ALF, need sao around the ctu |
194 | 55.9M | }; |
195 | 55.9M | uint8_t target = 0; |
196 | 55.9M | VVCFrameContext *fc = t->fc; |
197 | | |
198 | 55.9M | if (stage == VVC_TASK_STAGE_INIT) |
199 | 1.53M | return 1; |
200 | | |
201 | 54.4M | if (stage == VVC_TASK_STAGE_PARSE) { |
202 | 4.78M | const H266RawSPS *rsps = fc->ps.sps->r; |
203 | 4.78M | const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, t->ry); |
204 | 4.78M | const int no_prev_stage = t->rs > 0; |
205 | 4.78M | target = 2 + wpp - no_prev_stage; //left parse + colocation + wpp - no_prev_stage |
206 | 49.6M | } else if (stage == VVC_TASK_STAGE_INTER) { |
207 | 1.62M | target = atomic_load(&t->target_inter_score); |
208 | 48.0M | } else { |
209 | 48.0M | target = target_score[stage - VVC_TASK_STAGE_DEBLOCK_BS]; |
210 | 48.0M | } |
211 | | |
212 | | //+1 for previous stage |
213 | 54.4M | av_assert0(score <= target + 1); |
214 | 54.4M | return score == target + 1; |
215 | 54.4M | } |
216 | | |
217 | | static void frame_thread_add_score(VVCContext *s, VVCFrameThread *ft, |
218 | | const int rx, const int ry, const VVCTaskStage stage) |
219 | 296M | { |
220 | 296M | VVCTask *t = ft->tasks + ft->ctu_width * ry + rx; |
221 | 296M | uint8_t score; |
222 | | |
223 | 296M | if (rx < 0 || rx >= ft->ctu_width || ry < 0 || ry >= ft->ctu_height) |
224 | 254M | return; |
225 | | |
226 | 41.7M | score = task_add_score(t, stage); |
227 | 41.7M | if (task_has_target_score(t, stage, score)) { |
228 | 1.70M | av_assert0(s); |
229 | 1.70M | av_assert0(stage == t->stage); |
230 | 1.70M | add_task(s, t); |
231 | 1.70M | } |
232 | 41.7M | } |
233 | | |
234 | | static void sheduled_done(VVCFrameThread *ft, atomic_int *scheduled) |
235 | 1.76M | { |
236 | 1.76M | if (atomic_fetch_sub(scheduled, 1) == 1) { |
237 | 1.59M | ff_mutex_lock(&ft->lock); |
238 | 1.59M | ff_cond_signal(&ft->cond); |
239 | 1.59M | ff_mutex_unlock(&ft->lock); |
240 | 1.59M | } |
241 | 1.76M | } |
242 | | |
243 | | static void progress_done(VVCProgressListener *_l, const int type) |
244 | 56.9k | { |
245 | 56.9k | const ProgressListener *l = (ProgressListener *)_l; |
246 | 56.9k | const VVCTask *t = l->task; |
247 | 56.9k | VVCFrameThread *ft = t->fc->ft; |
248 | | |
249 | 56.9k | frame_thread_add_score(l->s, ft, t->rx, t->ry, type); |
250 | 56.9k | sheduled_done(ft, &ft->nb_scheduled_listeners); |
251 | 56.9k | } |
252 | | |
253 | | static void pixel_done(VVCProgressListener *l) |
254 | 16.8k | { |
255 | 16.8k | progress_done(l, VVC_TASK_STAGE_INTER); |
256 | 16.8k | } |
257 | | |
258 | | static void mv_done(VVCProgressListener *l) |
259 | 40.0k | { |
260 | 40.0k | progress_done(l, VVC_TASK_STAGE_PARSE); |
261 | 40.0k | } |
262 | | |
263 | | static void listener_init(ProgressListener *l, VVCTask *t, VVCContext *s, const VVCProgress vp, const int y) |
264 | 56.9k | { |
265 | 56.9k | const int is_inter = vp == VVC_PROGRESS_PIXEL; |
266 | | |
267 | 56.9k | l->task = t; |
268 | 56.9k | l->s = s; |
269 | 56.9k | l->l.vp = vp; |
270 | 56.9k | l->l.y = y; |
271 | 56.9k | l->l.progress_done = is_inter ? pixel_done : mv_done; |
272 | 56.9k | if (is_inter) |
273 | 56.9k | atomic_fetch_add(&t->target_inter_score, 1); |
274 | 56.9k | } |
275 | | |
276 | | static void add_progress_listener(VVCFrame *ref, ProgressListener *l, |
277 | | VVCTask *t, VVCContext *s, const VVCProgress vp, const int y) |
278 | 56.9k | { |
279 | 56.9k | VVCFrameThread *ft = t->fc->ft; |
280 | | |
281 | 56.9k | atomic_fetch_add(&ft->nb_scheduled_listeners, 1); |
282 | 56.9k | listener_init(l, t, s, vp, y); |
283 | 56.9k | ff_vvc_add_progress_listener(ref, (VVCProgressListener*)l); |
284 | 56.9k | } |
285 | | |
286 | | static void ep_init_wpp(EntryPoint *next, const EntryPoint *ep, const VVCSPS *sps) |
287 | 0 | { |
288 | 0 | memcpy(next->cabac_state, ep->cabac_state, sizeof(next->cabac_state)); |
289 | 0 | memcpy(next->pp, ep->pp, sizeof(next->pp)); |
290 | 0 | ff_vvc_ep_init_stat_coeff(next, sps->bit_depth, sps->r->sps_persistent_rice_adaptation_enabled_flag); |
291 | 0 | } |
292 | | |
293 | | static void schedule_next_parse(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, const VVCTask *t) |
294 | 170k | { |
295 | 170k | VVCFrameThread *ft = fc->ft; |
296 | 170k | EntryPoint *ep = t->ep; |
297 | 170k | const VVCSPS *sps = fc->ps.sps; |
298 | | |
299 | 170k | if (sps->r->sps_entropy_coding_sync_enabled_flag) { |
300 | 24.5k | if (t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]) { |
301 | 24.5k | EntryPoint *next = ep + 1; |
302 | 24.5k | if (next < sc->eps + sc->nb_eps && !is_first_row(fc, t->rx, t->ry + 1)) |
303 | 0 | ep_init_wpp(next, ep, sps); |
304 | 24.5k | } |
305 | 24.5k | if (t->ry + 1 < ft->ctu_height && !is_first_row(fc, t->rx, t->ry + 1)) |
306 | 0 | frame_thread_add_score(s, ft, t->rx, t->ry + 1, VVC_TASK_STAGE_PARSE); |
307 | 24.5k | } |
308 | | |
309 | 170k | if (t->ctu_idx + 1 < t->ep->ctu_end) { |
310 | 74.7k | const int next_rs = sc->sh.ctb_addr_in_curr_slice[t->ctu_idx + 1]; |
311 | 74.7k | const int next_rx = next_rs % ft->ctu_width; |
312 | 74.7k | const int next_ry = next_rs / ft->ctu_width; |
313 | 74.7k | frame_thread_add_score(s, ft, next_rx, next_ry, VVC_TASK_STAGE_PARSE); |
314 | 74.7k | } |
315 | 170k | } |
316 | | |
317 | | static void schedule_inter(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, VVCTask *t, const int rs) |
318 | 170k | { |
319 | 170k | const VVCSH *sh = &sc->sh; |
320 | | |
321 | 170k | if (!IS_I(sh->r)) { |
322 | 18.8k | CTU *ctu = fc->tab.ctus + rs; |
323 | 56.5k | for (int lx = 0; lx < 2; lx++) { |
324 | 70.0k | for (int i = 0; i < sh->r->num_ref_idx_active[lx]; i++) { |
325 | 32.4k | int y = ctu->max_y[lx][i]; |
326 | 32.4k | VVCRefPic *refp = sc->rpl[lx].refs + i; |
327 | 32.4k | VVCFrame *ref = refp->ref; |
328 | 32.4k | if (ref && y >= 0) { |
329 | 16.8k | if (refp->is_scaled) |
330 | 0 | y = y * refp->scale[1] >> 14; |
331 | 16.8k | add_progress_listener(ref, &t->listener[lx][i], t, s, VVC_PROGRESS_PIXEL, y + LUMA_EXTRA_AFTER); |
332 | 16.8k | } |
333 | 32.4k | } |
334 | 37.6k | } |
335 | 18.8k | } |
336 | 170k | } |
337 | | |
338 | | static void parse_task_done(VVCContext *s, VVCFrameContext *fc, const int rx, const int ry) |
339 | 170k | { |
340 | 170k | VVCFrameThread *ft = fc->ft; |
341 | 170k | const int rs = ry * ft->ctu_width + rx; |
342 | 170k | const int slice_idx = fc->tab.slice_idx[rs]; |
343 | 170k | VVCTask *t = ft->tasks + rs; |
344 | 170k | const SliceContext *sc = fc->slices[slice_idx]; |
345 | | |
346 | 170k | schedule_next_parse(s, fc, sc, t); |
347 | 170k | schedule_inter(s, fc, sc, t, rs); |
348 | 170k | } |
349 | | |
350 | | static void task_stage_done(const VVCTask *t, VVCContext *s) |
351 | 115M | { |
352 | 115M | VVCFrameContext *fc = t->fc; |
353 | 115M | VVCFrameThread *ft = fc->ft; |
354 | 115M | const VVCTaskStage stage = t->stage; |
355 | | |
356 | 291M | #define ADD(dx, dy, stage) frame_thread_add_score(s, ft, t->rx + (dx), t->ry + (dy), stage) |
357 | | |
358 | | //this is a reserve map of ready_score, ordered by zigzag |
359 | 115M | if (stage == VVC_TASK_STAGE_PARSE) { |
360 | 12.7M | ADD( 0, 1, VVC_TASK_STAGE_DEBLOCK_BS); |
361 | 12.7M | ADD( 1, 0, VVC_TASK_STAGE_DEBLOCK_BS); |
362 | 12.7M | if (t->rx < 0 || t->rx >= ft->ctu_width || t->ry < 0 || t->ry >= ft->ctu_height) |
363 | 12.5M | return; |
364 | 170k | parse_task_done(s, fc, t->rx, t->ry); |
365 | 102M | } else if (stage == VVC_TASK_STAGE_RECON) { |
366 | 12.7M | ADD(-1, 1, VVC_TASK_STAGE_RECON); |
367 | 12.7M | ADD( 1, 0, VVC_TASK_STAGE_RECON); |
368 | 12.7M | ADD(-1, -1, VVC_TASK_STAGE_LMCS); |
369 | 12.7M | ADD( 0, -1, VVC_TASK_STAGE_LMCS); |
370 | 12.7M | ADD(-1, 0, VVC_TASK_STAGE_LMCS); |
371 | 90.1M | } else if (stage == VVC_TASK_STAGE_DEBLOCK_V) { |
372 | 12.6M | ADD( 1, 0, VVC_TASK_STAGE_DEBLOCK_V); |
373 | 12.6M | ADD(-1, 0, VVC_TASK_STAGE_DEBLOCK_H); |
374 | 77.4M | } else if (stage == VVC_TASK_STAGE_DEBLOCK_H) { |
375 | 12.6M | ADD( 0, 1, VVC_TASK_STAGE_DEBLOCK_H); |
376 | 12.6M | ADD(-1, -1, VVC_TASK_STAGE_SAO); |
377 | 12.6M | ADD( 0, -1, VVC_TASK_STAGE_SAO); |
378 | 12.6M | ADD(-1, 0, VVC_TASK_STAGE_SAO); |
379 | 12.6M | ADD( 1, -1, VVC_TASK_STAGE_SAO); |
380 | 12.6M | ADD( 1, 0, VVC_TASK_STAGE_SAO); |
381 | 64.8M | } else if (stage == VVC_TASK_STAGE_SAO) { |
382 | 12.6M | ADD(-1, -1, VVC_TASK_STAGE_ALF); |
383 | 12.6M | ADD( 0, -1, VVC_TASK_STAGE_ALF); |
384 | 12.6M | ADD(-1, 0, VVC_TASK_STAGE_ALF); |
385 | 12.6M | ADD( 1, -1, VVC_TASK_STAGE_ALF); |
386 | 12.6M | ADD(-1, 1, VVC_TASK_STAGE_ALF); |
387 | 12.6M | ADD( 1, 0, VVC_TASK_STAGE_ALF); |
388 | 12.6M | ADD( 0, 1, VVC_TASK_STAGE_ALF); |
389 | 12.6M | ADD( 1, 1, VVC_TASK_STAGE_ALF); |
390 | 12.6M | } |
391 | 115M | } |
392 | | |
393 | | static int task_is_stage_ready(VVCTask *t, int add) |
394 | 15.7M | { |
395 | 15.7M | const VVCTaskStage stage = t->stage; |
396 | 15.7M | uint8_t score; |
397 | 15.7M | if (stage > VVC_TASK_STAGE_ALF) |
398 | 1.54M | return 0; |
399 | 14.2M | score = task_get_score(t, stage) + add; |
400 | 14.2M | return task_has_target_score(t, stage, score); |
401 | 15.7M | } |
402 | | |
403 | | static void check_colocation(VVCContext *s, VVCTask *t) |
404 | 1.63M | { |
405 | 1.63M | const VVCFrameContext *fc = t->fc; |
406 | | |
407 | 1.63M | if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || fc->ps.sps->r->sps_sbtmvp_enabled_flag) { |
408 | 790k | VVCFrame *col = fc->ref->collocated_ref; |
409 | 790k | const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]; |
410 | 790k | if (col && first_col) { |
411 | | //we depend on bottom and right boundary, do not - 1 for y |
412 | 40.0k | const int y = (t->ry << fc->ps.sps->ctb_log2_size_y); |
413 | 40.0k | add_progress_listener(col, &t->col_listener, t, s, VVC_PROGRESS_MV, y); |
414 | 40.0k | return; |
415 | 40.0k | } |
416 | 790k | } |
417 | 1.59M | frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); |
418 | 1.59M | } |
419 | | |
420 | | static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, SliceContext *sc, EntryPoint *ep) |
421 | 1.53M | { |
422 | 1.53M | const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start]; |
423 | 1.53M | VVCTask *t = ft->tasks + rs; |
424 | | |
425 | 1.53M | frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); |
426 | 1.53M | } |
427 | | |
428 | | static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
429 | 1.53M | { |
430 | 1.53M | VVCFrameContext *fc = lc->fc; |
431 | 1.53M | VVCFrameThread *ft = fc->ft; |
432 | 1.53M | const int ret = ff_vvc_per_frame_init(fc); |
433 | | |
434 | 1.53M | if (ret < 0) |
435 | 0 | return ret; |
436 | | |
437 | 3.07M | for (int i = 0; i < fc->nb_slices; i++) { |
438 | 1.53M | SliceContext *sc = fc->slices[i]; |
439 | 3.07M | for (int j = 0; j < sc->nb_eps; j++) { |
440 | 1.53M | EntryPoint *ep = sc->eps + j; |
441 | 3.16M | for (int k = ep->ctu_start; k < ep->ctu_end; k++) { |
442 | 1.63M | const int rs = sc->sh.ctb_addr_in_curr_slice[k]; |
443 | 1.63M | VVCTask *t = ft->tasks + rs; |
444 | 1.63M | check_colocation(s, t); |
445 | 1.63M | } |
446 | 1.53M | submit_entry_point(s, ft, sc, ep); |
447 | 1.53M | } |
448 | 1.53M | } |
449 | 1.53M | return 0; |
450 | 1.53M | } |
451 | | |
452 | | static void report_frame_progress(VVCFrameContext *fc, |
453 | | const int ry, const VVCProgress idx) |
454 | 272k | { |
455 | 272k | VVCFrameThread *ft = fc->ft; |
456 | 272k | const int ctu_size = ft->ctu_size; |
457 | 272k | int old; |
458 | | |
459 | 272k | if (atomic_fetch_add(&ft->rows[ry].col_progress[idx], 1) == ft->ctu_width - 1) { |
460 | 195k | int y; |
461 | 195k | ff_mutex_lock(&ft->lock); |
462 | 195k | y = old = ft->row_progress[idx]; |
463 | 391k | while (y < ft->ctu_height && atomic_load(&ft->rows[y].col_progress[idx]) == ft->ctu_width) |
464 | 195k | y++; |
465 | 195k | if (old != y) |
466 | 195k | ft->row_progress[idx] = y; |
467 | | // ff_vvc_report_progress will acquire other frames' locks, which could lead to a deadlock |
468 | | // We need to unlock ft->lock first |
469 | 195k | ff_mutex_unlock(&ft->lock); |
470 | | |
471 | 195k | if (old != y) { |
472 | 195k | const int progress = y == ft->ctu_height ? INT_MAX : y * ctu_size; |
473 | 195k | ff_vvc_report_progress(fc->ref, idx, progress); |
474 | 195k | } |
475 | 195k | } |
476 | 272k | } |
477 | | |
478 | | static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
479 | 1.61M | { |
480 | 1.61M | int ret; |
481 | 1.61M | VVCFrameContext *fc = lc->fc; |
482 | 1.61M | const int rs = t->rs; |
483 | 1.61M | const CTU *ctu = fc->tab.ctus + rs; |
484 | | |
485 | 1.61M | lc->ep = t->ep; |
486 | | |
487 | 1.61M | ret = ff_vvc_coding_tree_unit(lc, t->ctu_idx, rs, t->rx, t->ry); |
488 | 1.61M | if (ret < 0) |
489 | 1.44M | return ret; |
490 | | |
491 | 170k | if (!ctu->has_dmvr) |
492 | 170k | report_frame_progress(lc->fc, t->ry, VVC_PROGRESS_MV); |
493 | | |
494 | 170k | return 0; |
495 | 1.61M | } |
496 | | |
497 | | static int run_deblock_bs(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
498 | 170k | { |
499 | 170k | if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) |
500 | 163k | ff_vvc_deblock_bs(lc, t->rx, t->ry, t->rs); |
501 | | |
502 | 170k | return 0; |
503 | 170k | } |
504 | | |
505 | | static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
506 | 170k | { |
507 | 170k | VVCFrameContext *fc = lc->fc; |
508 | 170k | const CTU *ctu = fc->tab.ctus + t->rs; |
509 | 170k | int ret; |
510 | | |
511 | 170k | ret = ff_vvc_predict_inter(lc, t->rs); |
512 | 170k | if (ret < 0) |
513 | 0 | return ret; |
514 | | |
515 | 170k | if (ctu->has_dmvr) |
516 | 0 | report_frame_progress(fc, t->ry, VVC_PROGRESS_MV); |
517 | | |
518 | 170k | return 0; |
519 | 170k | } |
520 | | |
521 | | static int run_recon(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
522 | 170k | { |
523 | 170k | return ff_vvc_reconstruct(lc, t->rs, t->rx, t->ry); |
524 | 170k | } |
525 | | |
526 | | static int run_lmcs(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
527 | 101k | { |
528 | 101k | VVCFrameContext *fc = lc->fc; |
529 | 101k | VVCFrameThread *ft = fc->ft; |
530 | 101k | const int ctu_size = ft->ctu_size; |
531 | 101k | const int x0 = t->rx * ctu_size; |
532 | 101k | const int y0 = t->ry * ctu_size; |
533 | | |
534 | 101k | ff_vvc_lmcs_filter(lc, x0, y0); |
535 | | |
536 | 101k | return 0; |
537 | 101k | } |
538 | | |
539 | | static int run_deblock_v(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
540 | 101k | { |
541 | 101k | VVCFrameContext *fc = lc->fc; |
542 | 101k | VVCFrameThread *ft = fc->ft; |
543 | 101k | const int ctb_size = ft->ctu_size; |
544 | 101k | const int x0 = t->rx * ctb_size; |
545 | 101k | const int y0 = t->ry * ctb_size; |
546 | | |
547 | 101k | if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) { |
548 | 95.7k | ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs); |
549 | 95.7k | ff_vvc_deblock_vertical(lc, x0, y0, t->rs); |
550 | 95.7k | } |
551 | | |
552 | 101k | return 0; |
553 | 101k | } |
554 | | |
555 | | static int run_deblock_h(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
556 | 101k | { |
557 | 101k | VVCFrameContext *fc = lc->fc; |
558 | 101k | VVCFrameThread *ft = fc->ft; |
559 | 101k | const int ctb_size = ft->ctu_size; |
560 | 101k | const int x0 = t->rx * ctb_size; |
561 | 101k | const int y0 = t->ry * ctb_size; |
562 | | |
563 | 101k | if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) { |
564 | 95.7k | ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs); |
565 | 95.7k | ff_vvc_deblock_horizontal(lc, x0, y0, t->rs); |
566 | 95.7k | } |
567 | 101k | if (fc->ps.sps->r->sps_sao_enabled_flag) |
568 | 58.8k | ff_vvc_sao_copy_ctb_to_hv(lc, t->rx, t->ry, t->ry == ft->ctu_height - 1); |
569 | | |
570 | 101k | return 0; |
571 | 101k | } |
572 | | |
573 | | static int run_sao(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
574 | 101k | { |
575 | 101k | VVCFrameContext *fc = lc->fc; |
576 | 101k | VVCFrameThread *ft = fc->ft; |
577 | 101k | const int ctb_size = ft->ctu_size; |
578 | 101k | const int x0 = t->rx * ctb_size; |
579 | 101k | const int y0 = t->ry * ctb_size; |
580 | | |
581 | 101k | if (fc->ps.sps->r->sps_sao_enabled_flag) { |
582 | 58.8k | ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs); |
583 | 58.8k | ff_vvc_sao_filter(lc, x0, y0); |
584 | 58.8k | } |
585 | | |
586 | 101k | if (fc->ps.sps->r->sps_alf_enabled_flag) |
587 | 49.4k | ff_vvc_alf_copy_ctu_to_hv(lc, x0, y0); |
588 | | |
589 | 101k | return 0; |
590 | 101k | } |
591 | | |
592 | | static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t) |
593 | 101k | { |
594 | 101k | VVCFrameContext *fc = lc->fc; |
595 | 101k | VVCFrameThread *ft = fc->ft; |
596 | 101k | const int ctu_size = ft->ctu_size; |
597 | 101k | const int x0 = t->rx * ctu_size; |
598 | 101k | const int y0 = t->ry * ctu_size; |
599 | | |
600 | 101k | if (fc->ps.sps->r->sps_alf_enabled_flag) { |
601 | 49.4k | ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs); |
602 | 49.4k | ff_vvc_alf_filter(lc, x0, y0); |
603 | 49.4k | } |
604 | 101k | report_frame_progress(fc, t->ry, VVC_PROGRESS_PIXEL); |
605 | | |
606 | 101k | return 0; |
607 | 101k | } |
608 | | |
/**
 * Short stage names for log messages, indexed by VVCTaskStage
 * (VVC_TASK_STAGE_INIT .. VVC_TASK_STAGE_ALF).
 *
 * Both the strings and the table itself are constant, and the storage
 * class specifier comes first as the C standard asks for.
 */
static const char *const task_name[] = {
    "INIT",
    "P",
    "B",
    "I",
    "R",
    "L",
    "V",
    "H",
    "S",
    "A"
};
621 | | |
622 | | typedef int (*run_func)(VVCContext *s, VVCLocalContext *lc, VVCTask *t); |
623 | | |
624 | | static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc) |
625 | 15.7M | { |
626 | 15.7M | int ret; |
627 | 15.7M | VVCFrameContext *fc = t->fc; |
628 | 15.7M | VVCFrameThread *ft = fc->ft; |
629 | 15.7M | const VVCTaskStage stage = t->stage; |
630 | 15.7M | static const run_func run[] = { |
631 | 15.7M | run_init, |
632 | 15.7M | run_parse, |
633 | 15.7M | run_deblock_bs, |
634 | 15.7M | run_inter, |
635 | 15.7M | run_recon, |
636 | 15.7M | run_lmcs, |
637 | 15.7M | run_deblock_v, |
638 | 15.7M | run_deblock_h, |
639 | 15.7M | run_sao, |
640 | 15.7M | run_alf, |
641 | 15.7M | }; |
642 | | |
643 | 15.7M | ff_dlog(s->avctx, "frame %5d, %s(%3d, %3d)\r\n", (int)t->fc->decode_order, task_name[stage], t->rx, t->ry); |
644 | | |
645 | 15.7M | lc->sc = t->sc; |
646 | | |
647 | 15.7M | if (!atomic_load(&ft->ret)) { |
648 | 4.16M | if ((ret = run[stage](s, lc, t)) < 0) { |
649 | | #ifdef COMPAT_ATOMICS_WIN32_STDATOMIC_H |
650 | | intptr_t zero = 0; |
651 | | #else |
652 | 1.44M | int zero = 0; |
653 | 1.44M | #endif |
654 | 1.44M | atomic_compare_exchange_strong(&ft->ret, &zero, ret); |
655 | 1.44M | av_log(s->avctx, AV_LOG_ERROR, |
656 | 1.44M | "frame %5d, %s(%3d, %3d) failed with %d\r\n", |
657 | 1.44M | (int)fc->decode_order, task_name[stage], t->rx, t->ry, ret); |
658 | 1.44M | } |
659 | 4.16M | if (!ret) |
660 | 2.72M | task_stage_done(t, s); |
661 | 4.16M | } |
662 | 15.7M | return; |
663 | 15.7M | } |
664 | | |
665 | | static int task_run(FFTask *_t, void *local_context, void *user_data) |
666 | 1.70M | { |
667 | 1.70M | VVCTask *t = (VVCTask*)_t; |
668 | 1.70M | VVCContext *s = (VVCContext *)user_data; |
669 | 1.70M | VVCLocalContext *lc = local_context; |
670 | 1.70M | VVCFrameThread *ft = t->fc->ft; |
671 | | |
672 | 1.70M | lc->fc = t->fc; |
673 | | |
674 | 15.7M | do { |
675 | 15.7M | task_run_stage(t, s, lc); |
676 | 15.7M | t->stage++; |
677 | 15.7M | } while (task_is_stage_ready(t, 1)); |
678 | | |
679 | 1.70M | if (t->stage != VVC_TASK_STAGE_LAST) |
680 | 161k | frame_thread_add_score(s, ft, t->rx, t->ry, t->stage); |
681 | | |
682 | 1.70M | sheduled_done(ft, &ft->nb_scheduled_tasks); |
683 | | |
684 | 1.70M | return 0; |
685 | 1.70M | } |
686 | | |
687 | | av_cold FFExecutor* ff_vvc_executor_alloc(VVCContext *s, const int thread_count) |
688 | 15.4k | { |
689 | 15.4k | FFTaskCallbacks callbacks = { |
690 | 15.4k | s, |
691 | 15.4k | sizeof(VVCLocalContext), |
692 | 15.4k | PRIORITY_LOWEST + 1, |
693 | 15.4k | task_run, |
694 | 15.4k | }; |
695 | 15.4k | return ff_executor_alloc(&callbacks, thread_count); |
696 | 15.4k | } |
697 | | |
698 | | av_cold void ff_vvc_executor_free(FFExecutor **e) |
699 | 15.4k | { |
700 | 15.4k | ff_executor_free(e); |
701 | 15.4k | } |
702 | | |
703 | | void ff_vvc_frame_thread_free(VVCFrameContext *fc) |
704 | 316k | { |
705 | 316k | VVCFrameThread *ft = fc->ft; |
706 | | |
707 | 316k | if (!ft) |
708 | 246k | return; |
709 | | |
710 | 69.8k | ff_mutex_destroy(&ft->lock); |
711 | 69.8k | ff_cond_destroy(&ft->cond); |
712 | 69.8k | av_freep(&ft->rows); |
713 | 69.8k | av_freep(&ft->tasks); |
714 | 69.8k | av_freep(&ft); |
715 | 69.8k | } |
716 | | |
717 | | static void frame_thread_init_score(VVCFrameContext *fc) |
718 | 1.56M | { |
719 | 1.56M | const VVCFrameThread *ft = fc->ft; |
720 | 1.56M | VVCTask task; |
721 | | |
722 | 1.56M | task_init(&task, VVC_TASK_STAGE_PARSE, fc, 0, 0); |
723 | | |
724 | 15.6M | for (int i = VVC_TASK_STAGE_PARSE; i < VVC_TASK_STAGE_LAST; i++) { |
725 | 14.0M | task.stage = i; |
726 | | |
727 | 56.3M | for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) { |
728 | 42.2M | task.ry = -1; //top |
729 | 42.2M | task_stage_done(&task, NULL); |
730 | 42.2M | task.ry = ft->ctu_height; //bottom |
731 | 42.2M | task_stage_done(&task, NULL); |
732 | 42.2M | } |
733 | | |
734 | 28.1M | for (task.ry = 0; task.ry < ft->ctu_height; task.ry++) { |
735 | 14.1M | task.rx = -1; //left |
736 | 14.1M | task_stage_done(&task, NULL); |
737 | 14.1M | task.rx = ft->ctu_width; //right |
738 | 14.1M | task_stage_done(&task, NULL); |
739 | 14.1M | } |
740 | 14.0M | } |
741 | 1.56M | } |
742 | | |
743 | | int ff_vvc_frame_thread_init(VVCFrameContext *fc) |
744 | 1.56M | { |
745 | 1.56M | const VVCSPS *sps = fc->ps.sps; |
746 | 1.56M | const VVCPPS *pps = fc->ps.pps; |
747 | 1.56M | VVCFrameThread *ft = fc->ft; |
748 | 1.56M | int ret; |
749 | | |
750 | 1.56M | if (!ft || ft->ctu_width != pps->ctb_width || |
751 | 1.49M | ft->ctu_height != pps->ctb_height || |
752 | 1.49M | ft->ctu_size != sps->ctb_size_y) { |
753 | | |
754 | 69.8k | ff_vvc_frame_thread_free(fc); |
755 | 69.8k | ft = av_calloc(1, sizeof(*fc->ft)); |
756 | 69.8k | if (!ft) |
757 | 0 | return AVERROR(ENOMEM); |
758 | | |
759 | 69.8k | ft->ctu_width = fc->ps.pps->ctb_width; |
760 | 69.8k | ft->ctu_height = fc->ps.pps->ctb_height; |
761 | 69.8k | ft->ctu_count = fc->ps.pps->ctb_count; |
762 | 69.8k | ft->ctu_size = fc->ps.sps->ctb_size_y; |
763 | | |
764 | 69.8k | ft->rows = av_calloc(ft->ctu_height, sizeof(*ft->rows)); |
765 | 69.8k | if (!ft->rows) |
766 | 0 | goto fail; |
767 | | |
768 | 69.8k | ft->tasks = av_malloc(ft->ctu_count * sizeof(*ft->tasks)); |
769 | 69.8k | if (!ft->tasks) |
770 | 0 | goto fail; |
771 | | |
772 | 69.8k | if ((ret = ff_cond_init(&ft->cond, NULL))) |
773 | 0 | goto fail; |
774 | | |
775 | 69.8k | if ((ret = ff_mutex_init(&ft->lock, NULL))) { |
776 | 0 | ff_cond_destroy(&ft->cond); |
777 | 0 | goto fail; |
778 | 0 | } |
779 | 69.8k | } |
780 | 1.56M | fc->ft = ft; |
781 | 1.56M | ft->ret = 0; |
782 | 3.13M | for (int y = 0; y < ft->ctu_height; y++) { |
783 | 1.56M | VVCRowThread *row = ft->rows + y; |
784 | 1.56M | memset(row->col_progress, 0, sizeof(row->col_progress)); |
785 | 1.56M | } |
786 | | |
787 | 3.25M | for (int rs = 0; rs < ft->ctu_count; rs++) { |
788 | 1.69M | VVCTask *t = ft->tasks + rs; |
789 | 1.69M | task_init(t, rs ? VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT, fc, rs % ft->ctu_width, rs / ft->ctu_width); |
790 | 1.69M | } |
791 | | |
792 | 1.56M | memset(&ft->row_progress[0], 0, sizeof(ft->row_progress)); |
793 | | |
794 | 1.56M | frame_thread_init_score(fc); |
795 | | |
796 | 1.56M | return 0; |
797 | | |
798 | 0 | fail: |
799 | 0 | if (ft) { |
800 | 0 | av_freep(&ft->rows); |
801 | 0 | av_freep(&ft->tasks); |
802 | 0 | av_freep(&ft); |
803 | 0 | } |
804 | |
|
805 | 0 | return AVERROR(ENOMEM); |
806 | 1.56M | } |
807 | | |
808 | | int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc) |
809 | 1.53M | { |
810 | 1.53M | VVCFrameThread *ft = fc->ft; |
811 | | |
812 | 3.07M | for (int i = 0; i < fc->nb_slices; i++) { |
813 | 1.53M | SliceContext *sc = fc->slices[i]; |
814 | 3.07M | for (int j = 0; j < sc->nb_eps; j++) { |
815 | 1.53M | EntryPoint *ep = sc->eps + j; |
816 | 3.20M | for (int k = ep->ctu_start; k < ep->ctu_end; k++) { |
817 | 1.66M | const int rs = sc->sh.ctb_addr_in_curr_slice[k]; |
818 | 1.66M | VVCTask *t = ft->tasks + rs; |
819 | 1.66M | const int ret = task_init_parse(t, sc, ep, k); |
820 | 1.66M | if (ret < 0) |
821 | 1.15k | return ret; |
822 | 1.66M | } |
823 | 1.53M | } |
824 | 1.53M | } |
825 | 3.16M | for (int rs = 0; rs < ft->ctu_count; rs++) { |
826 | 1.63M | const VVCTask *t = ft->tasks + rs; |
827 | 1.63M | if (!t->sc) { |
828 | 0 | av_log(s->avctx, AV_LOG_ERROR, "frame %5d, CTU(%d, %d) not belong to any slice\r\n", (int)fc->decode_order, t->rx, t->ry); |
829 | 0 | return AVERROR_INVALIDDATA; |
830 | 0 | } |
831 | 1.63M | } |
832 | 1.53M | frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT); |
833 | | |
834 | 1.53M | return 0; |
835 | 1.53M | } |
836 | | |
837 | | int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc) |
838 | 1.53M | { |
839 | 1.53M | VVCFrameThread *ft = fc->ft; |
840 | | |
841 | 1.53M | ff_mutex_lock(&ft->lock); |
842 | | |
843 | 1.53M | while (atomic_load(&ft->nb_scheduled_tasks) || atomic_load(&ft->nb_scheduled_listeners)) |
844 | 0 | ff_cond_wait(&ft->cond, &ft->lock); |
845 | | |
846 | 1.53M | ff_mutex_unlock(&ft->lock); |
847 | 1.53M | ff_vvc_report_frame_finished(fc->ref); |
848 | | |
849 | 1.53M | ff_dlog(s->avctx, "frame %5d done\r\n", (int)fc->decode_order); |
850 | 1.53M | return ft->ret; |
851 | 1.53M | } |