/src/ffmpeg/libswscale/ops_dispatch.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "libavutil/avassert.h" |
22 | | #include "libavutil/mem.h" |
23 | | #include "libavutil/mem_internal.h" |
24 | | |
25 | | #include "ops.h" |
26 | | #include "ops_internal.h" |
27 | | #include "ops_dispatch.h" |
28 | | |
/**
 * Private state for a compiled-operations graph pass. Filled in by compile()
 * and op_pass_setup(), consumed by op_pass_run() / handle_tail().
 */
typedef struct SwsOpPass {
    SwsCompiledOp comp;   /* compiled kernel and its metadata (block size,
                           * over-read/over-write margins, free callback) */
    SwsOpExec exec_base;  /* template execution context; copied and patched
                           * per slice in op_pass_run() */
    int num_blocks;       /* ceil(pass->width / comp.block_size) */
    int tail_off_in;      /* input byte offset of the last block's column */
    int tail_off_out;     /* output byte offset of the last block's column */
    int tail_size_in;     /* input bytes spanned by the last (partial) block */
    int tail_size_out;    /* output bytes spanned by the last (partial) block */
    int planes_in;        /* number of input planes (from rw_planes()) */
    int planes_out;       /* number of output planes (from rw_planes()) */
    int pixel_bits_in;    /* per-plane input pixel size in bits */
    int pixel_bits_out;   /* per-plane output pixel size in bits */
    int idx_in[4];        /* frame plane index for each input plane, or -1 */
    int idx_out[4];       /* frame plane index for each output plane, or -1 */
    bool memcpy_in;       /* input tail must be bounced through a temporary
                           * buffer to avoid reading past the buffer end */
    bool memcpy_out;      /* likewise for the output tail (over-write) */
} SwsOpPass;
46 | | |
47 | | int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, |
48 | | const SwsOpList *ops, SwsCompiledOp *out) |
49 | 0 | { |
50 | 0 | SwsOpList *copy; |
51 | 0 | SwsCompiledOp compiled = {0}; |
52 | 0 | int ret = 0; |
53 | |
|
54 | 0 | copy = ff_sws_op_list_duplicate(ops); |
55 | 0 | if (!copy) |
56 | 0 | return AVERROR(ENOMEM); |
57 | | |
58 | | /* Ensure these are always set during compilation */ |
59 | 0 | ff_sws_op_list_update_comps(copy); |
60 | |
|
61 | 0 | ret = backend->compile(ctx, copy, &compiled); |
62 | 0 | if (ret < 0) { |
63 | 0 | int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR; |
64 | 0 | av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n", |
65 | 0 | backend->name, av_err2str(ret)); |
66 | 0 | } else { |
67 | 0 | *out = compiled; |
68 | 0 | } |
69 | |
|
70 | 0 | ff_sws_op_list_free(©); |
71 | 0 | return ret; |
72 | 0 | } |
73 | | |
74 | | int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) |
75 | 0 | { |
76 | 0 | for (int n = 0; ff_sws_op_backends[n]; n++) { |
77 | 0 | const SwsOpBackend *backend = ff_sws_op_backends[n]; |
78 | 0 | if (ops->src.hw_format != backend->hw_format || |
79 | 0 | ops->dst.hw_format != backend->hw_format) |
80 | 0 | continue; |
81 | 0 | if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) |
82 | 0 | continue; |
83 | | |
84 | 0 | av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " |
85 | 0 | "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", |
86 | 0 | backend->name, out->block_size, out->over_read, out->over_write, |
87 | 0 | out->cpu_flags); |
88 | |
|
89 | 0 | ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops); |
90 | 0 | return 0; |
91 | 0 | } |
92 | | |
93 | 0 | av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); |
94 | 0 | ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops); |
95 | 0 | return AVERROR(ENOTSUP); |
96 | 0 | } |
97 | | |
98 | | static void op_pass_free(void *ptr) |
99 | 0 | { |
100 | 0 | SwsOpPass *p = ptr; |
101 | 0 | if (!p) |
102 | 0 | return; |
103 | | |
104 | 0 | if (p->comp.free) |
105 | 0 | p->comp.free(p->comp.priv); |
106 | |
|
107 | 0 | av_free(p); |
108 | 0 | } |
109 | | |
110 | | static inline void get_row_data(const SwsOpPass *p, const int y, |
111 | | const uint8_t *in[4], uint8_t *out[4]) |
112 | 0 | { |
113 | 0 | const SwsOpExec *base = &p->exec_base; |
114 | 0 | for (int i = 0; i < p->planes_in; i++) |
115 | 0 | in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i]; |
116 | 0 | for (int i = 0; i < p->planes_out; i++) |
117 | 0 | out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i]; |
118 | 0 | } |
119 | | |
/* Pass setup callback: precompute all per-frame execution parameters so that
 * op_pass_run() only needs to patch in per-slice row pointers. Decides
 * whether the tail (last block column) must be bounced through a temporary
 * buffer on the input and/or output side. */
static void op_pass_setup(const SwsFrame *out, const SwsFrame *in,
                          const SwsPass *pass)
{
    const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format);
    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);

    SwsOpPass *p = pass->priv;
    SwsOpExec *exec = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;
    const int block_size = comp->block_size;
    /* Round the image width up to whole blocks */
    p->num_blocks = (pass->width + block_size - 1) / block_size;

    /* Set up main loop parameters */
    const int aligned_w = p->num_blocks * block_size;
    /* Width covered by all blocks except the last; the final `tail_size`
     * pixels may need special handling (see handle_tail) */
    const int safe_width = (p->num_blocks - 1) * block_size;
    const int tail_size = pass->width - safe_width;
    p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
    p->tail_off_out = safe_width * p->pixel_bits_out >> 3;
    p->tail_size_in = tail_size * p->pixel_bits_in >> 3;
    p->tail_size_out = tail_size * p->pixel_bits_out >> 3;
    p->memcpy_in = false;
    p->memcpy_out = false;

    for (int i = 0; i < p->planes_in; i++) {
        const int idx = p->idx_in[i];
        /* Frame planes 1 and 2 are chroma and may be subsampled */
        const int chroma = idx == 1 || idx == 2;
        const int sub_x = chroma ? indesc->log2_chroma_w : 0;
        const int sub_y = chroma ? indesc->log2_chroma_h : 0;
        const int plane_w = (aligned_w + sub_x) >> sub_x;
        const int plane_pad = (comp->over_read + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_in >> 3;
        /* If padded reads would run past one linesize, the tail must be
         * copied through a bounce buffer. NOTE(review): this check is only
         * performed when the backend sets slice_align — confirm intent. */
        if (comp->slice_align)
            p->memcpy_in |= plane_size + plane_pad > in->linesize[idx];
        exec->in[i] = in->data[idx];
        exec->in_stride[i] = in->linesize[idx];
        exec->in_sub_y[i] = sub_y;
        exec->in_sub_x[i] = sub_x;
    }

    for (int i = 0; i < p->planes_out; i++) {
        const int idx = p->idx_out[i];
        const int chroma = idx == 1 || idx == 2;
        const int sub_x = chroma ? outdesc->log2_chroma_w : 0;
        const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
        const int plane_w = (aligned_w + sub_x) >> sub_x;
        const int plane_pad = (comp->over_write + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_out >> 3;
        /* Same over-write consideration as above, for the output side */
        if (comp->slice_align)
            p->memcpy_out |= plane_size + plane_pad > out->linesize[idx];
        exec->out[i] = out->data[idx];
        exec->out_stride[i] = out->linesize[idx];
        exec->out_sub_y[i] = sub_y;
        exec->out_sub_x[i] = sub_x;
    }

    /* Pre-fill pointer bump for the main section only; this value does not
     * matter at all for the tail / last row handlers because they only ever
     * process a single line */
    const int blocks_main = p->num_blocks - p->memcpy_out;
    for (int i = 0; i < 4; i++) {
        exec->in_bump[i] = exec->in_stride[i] - blocks_main * exec->block_size_in;
        exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
    }
}
184 | | |
/* Dispatch kernel over the last column of the image using memcpy.
 *
 * When copy_in is set, the tail pixels of each input row are first copied
 * into an on-stack, 64-byte-aligned bounce buffer (tmp[0]) so the kernel can
 * safely over-read; when copy_out is set, the kernel writes into tmp[1] and
 * only the valid tail bytes are copied back out. `exec` is clobbered in the
 * process, so callers must invoke this after the main dispatch. */
static av_always_inline void
handle_tail(const SwsOpPass *p, SwsOpExec *exec,
            const bool copy_out, const bool copy_in,
            int y, const int h)
{
    /* tmp[0] = input bounce buffers, tmp[1] = output bounce buffers;
     * 4 planes of sizeof(uint32_t[128]) bytes each */
    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];

    const SwsOpExec *base = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;
    const int tail_size_in = p->tail_size_in;
    const int tail_size_out = p->tail_size_out;
    /* Block index of the final (tail) column */
    const int bx = p->num_blocks - 1;

    const uint8_t *in_data[4];
    uint8_t *out_data[4];
    get_row_data(p, y, in_data, out_data);

    /* Point the kernel at either the real tail or the bounce buffers */
    for (int i = 0; i < p->planes_in; i++) {
        in_data[i] += p->tail_off_in;
        if (copy_in) {
            exec->in[i] = (void *) tmp[0][i];
            exec->in_stride[i] = sizeof(tmp[0][i]);
        } else {
            exec->in[i] = in_data[i];
        }
    }

    for (int i = 0; i < p->planes_out; i++) {
        out_data[i] += p->tail_off_out;
        if (copy_out) {
            exec->out[i] = (void *) tmp[1][i];
            exec->out_stride[i] = sizeof(tmp[1][i]);
        } else {
            exec->out[i] = out_data[i];
        }
    }

    /* Process one row at a time: copy tail in, run kernel, copy tail out */
    for (int y_end = y + h; y < y_end; y++) {
        if (copy_in) {
            for (int i = 0; i < p->planes_in; i++) {
                /* Ensure the tail fits inside this plane's bounce buffer */
                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
                memcpy(tmp[0][i], in_data[i], tail_size_in);
                in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */
            }
        }

        /* Run the compiled kernel over just the single tail block */
        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);

        if (copy_out) {
            for (int i = 0; i < p->planes_out; i++) {
                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
                memcpy(out_data[i], tmp[1][i], tail_size_out);
                out_data[i] += base->out_stride[i];
            }
        }

        /* Advance any direct (non-bounced) pointers to the next row */
        for (int i = 0; i < 4; i++) {
            if (!copy_in && exec->in[i])
                exec->in[i] += exec->in_stride[i];
            if (!copy_out && exec->out[i])
                exec->out[i] += exec->out_stride[i];
        }
    }
}
250 | | |
/* Pass run callback: dispatch the compiled kernel over a horizontal slice
 * [y, y+h) of the image, splitting off the tail column and/or last row into
 * memcpy-protected handle_tail() calls as required. */
static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
                        const int h, const SwsPass *pass)
{
    const SwsOpPass *p = pass->priv;
    const SwsCompiledOp *comp = &p->comp;

    /* Fill exec metadata for this slice */
    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
    exec.slice_y = y;
    exec.slice_h = h;

    /**
     * To ensure safety, we need to consider the following:
     *
     * 1. We can overread the input, unless this is the last line of an
     *    unpadded buffer. All defined operations can handle arbitrary pixel
     *    input, so overread of arbitrary data is fine.
     *
     * 2. We can overwrite the output, as long as we don't write more than the
     *    amount of pixels that fit into one linesize. So we always need to
     *    memcpy the last column on the output side if unpadded.
     *
     * 3. For the last row, we also need to memcpy the remainder of the input,
     *    to avoid reading past the end of the buffer. Note that since we know
     *    the run() function is called on stripes of the same buffer, we don't
     *    need to worry about this for the end of a slice.
     */

    const int last_slice = y + h == pass->height;
    const bool memcpy_in = last_slice && p->memcpy_in;
    const bool memcpy_out = p->memcpy_out;
    const int num_blocks = p->num_blocks;
    /* Shave off the tail block (if bounced) and the last row (if bounced)
     * from the fast path */
    const int blocks_main = num_blocks - memcpy_out;
    const int h_main = h - memcpy_in;

    /* Handle main section */
    get_row_data(p, y, exec.in, exec.out);
    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);

    if (memcpy_in) {
        /* Safe part of last row */
        get_row_data(p, y + h_main, exec.in, exec.out);
        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
    }

    /* Handle last column via memcpy, takes over `exec` so call these last */
    if (memcpy_out)
        handle_tail(p, &exec, true, false, y, h_main);
    if (memcpy_in)
        handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
}
302 | | |
303 | | static int rw_planes(const SwsOp *op) |
304 | 0 | { |
305 | 0 | return op->rw.packed ? 1 : op->rw.elems; |
306 | 0 | } |
307 | | |
308 | | static int rw_pixel_bits(const SwsOp *op) |
309 | 0 | { |
310 | 0 | const int elems = op->rw.packed ? op->rw.elems : 1; |
311 | 0 | const int size = ff_sws_pixel_type_size(op->type); |
312 | 0 | const int bits = 8 >> op->rw.frac; |
313 | 0 | av_assert1(bits >= 1); |
314 | 0 | return elems * size * bits; |
315 | 0 | } |
316 | | |
/* Compile an operation list and register it as a graph pass.
 *
 * For "opaque" compilations, the backend's function is registered directly
 * as the pass callback and ownership of the compiled state moves to the new
 * pass; otherwise an SwsOpPass wrapper (op_pass_run/setup/free) is set up
 * around the compiled kernel. On failure, all partially-built state is
 * released before returning the error. */
static int compile(SwsGraph *graph, const SwsOpList *ops,
                   const SwsFormat *dst, SwsPass *input, SwsPass **output)
{
    SwsContext *ctx = graph->ctx;
    SwsOpPass *p = av_mallocz(sizeof(*p));
    if (!p)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
    if (ret < 0)
        goto fail;

    if (p->comp.opaque) {
        /* The backend provides its own pass function; hand the compiled
         * state straight to the graph and discard the wrapper */
        SwsCompiledOp c = p->comp;
        av_free(p);
        return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                     input, c.slice_align, c.func_opaque,
                                     NULL, c.priv, c.free, output);
    }

    /* Derive dispatch geometry from the list's read and write operations */
    const SwsOp *read = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    p->planes_in = rw_planes(read);
    p->planes_out = rw_planes(write);
    p->pixel_bits_in = rw_pixel_bits(read);
    p->pixel_bits_out = rw_pixel_bits(write);
    p->exec_base = (SwsOpExec) {
        .width = dst->width,
        .height = dst->height,
        .block_size_in = p->comp.block_size * p->pixel_bits_in >> 3,
        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
    };

    /* Map logical planes to frame plane indices; unused slots get -1 */
    for (int i = 0; i < 4; i++) {
        p->idx_in[i] = i < p->planes_in ? ops->order_src.in[i] : -1;
        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
    }

    return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                 input, p->comp.slice_align, op_pass_run,
                                 op_pass_setup, p, op_pass_free, output);

fail:
    op_pass_free(p);
    return ret;
}
363 | | |
364 | | int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, |
365 | | const SwsFormat *dst, SwsPass *input, SwsPass **output) |
366 | 0 | { |
367 | 0 | SwsContext *ctx = graph->ctx; |
368 | 0 | int ret; |
369 | | |
370 | | /* Check if the whole operation graph is an end-to-end no-op */ |
371 | 0 | if (ff_sws_op_list_is_noop(ops)) { |
372 | 0 | *output = input; |
373 | 0 | return 0; |
374 | 0 | } |
375 | | |
376 | 0 | const SwsOp *read = ff_sws_op_list_input(ops); |
377 | 0 | const SwsOp *write = ff_sws_op_list_output(ops); |
378 | 0 | if (!read || !write) { |
379 | 0 | av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " |
380 | 0 | "and write, respectively.\n"); |
381 | 0 | return AVERROR(EINVAL); |
382 | 0 | } |
383 | | |
384 | 0 | if (flags & SWS_OP_FLAG_OPTIMIZE) { |
385 | 0 | ret = ff_sws_op_list_optimize(ops); |
386 | 0 | if (ret < 0) |
387 | 0 | return ret; |
388 | 0 | } |
389 | | |
390 | 0 | return compile(graph, ops, dst, input, output); |
391 | 0 | } |