/src/ffmpeg/libswscale/ops_dispatch.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "libavutil/avassert.h" |
22 | | #include "libavutil/mem.h" |
23 | | #include "libavutil/mem_internal.h" |
24 | | |
25 | | #include "ops.h" |
26 | | #include "ops_internal.h" |
27 | | #include "ops_dispatch.h" |
28 | | |
/**
 * Private state for a compiled-operations graph pass. Filled in by compile()
 * and op_pass_setup(), consumed by op_pass_run() / handle_tail().
 */
typedef struct SwsOpPass {
    SwsCompiledOp comp;   /* compiled kernel and its metadata (block size,
                           * over-read/over-write margins, free callback) */
    SwsOpExec exec_base;  /* template execution context; copied and patched
                           * per slice in op_pass_run() */
    int num_blocks;       /* ceil(pass->width / comp.block_size) */
    int tail_off_in;      /* input byte offset of the last block's column */
    int tail_off_out;     /* output byte offset of the last block's column */
    int tail_size_in;     /* input bytes spanned by the last (partial) block */
    int tail_size_out;    /* output bytes spanned by the last (partial) block */
    int planes_in;        /* number of input planes (from rw_planes()) */
    int planes_out;       /* number of output planes (from rw_planes()) */
    int pixel_bits_in;    /* per-plane input pixel size in bits */
    int pixel_bits_out;   /* per-plane output pixel size in bits */
    int idx_in[4];        /* frame plane index for each input plane, or -1 */
    int idx_out[4];       /* frame plane index for each output plane, or -1 */
    bool memcpy_in;       /* input tail must be bounced through a temporary
                           * buffer to avoid reading past the buffer end */
    bool memcpy_out;      /* likewise for the output tail (over-write) */
} SwsOpPass;
46 | | |
47 | | int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, |
48 | | const SwsOpList *ops, SwsCompiledOp *out) |
49 | 0 | { |
50 | 0 | SwsOpList *copy; |
51 | 0 | SwsCompiledOp compiled = {0}; |
52 | 0 | int ret = 0; |
53 | |
|
54 | 0 | copy = ff_sws_op_list_duplicate(ops); |
55 | 0 | if (!copy) |
56 | 0 | return AVERROR(ENOMEM); |
57 | | |
58 | | /* Ensure these are always set during compilation */ |
59 | 0 | ff_sws_op_list_update_comps(copy); |
60 | |
|
61 | 0 | ret = backend->compile(ctx, copy, &compiled); |
62 | 0 | if (ret < 0) { |
63 | 0 | int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR; |
64 | 0 | av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n", |
65 | 0 | backend->name, av_err2str(ret)); |
66 | 0 | } else { |
67 | 0 | *out = compiled; |
68 | 0 | } |
69 | |
|
70 | 0 | ff_sws_op_list_free(©); |
71 | 0 | return ret; |
72 | 0 | } |
73 | | |
74 | | int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) |
75 | 0 | { |
76 | 0 | for (int n = 0; ff_sws_op_backends[n]; n++) { |
77 | 0 | const SwsOpBackend *backend = ff_sws_op_backends[n]; |
78 | 0 | if (ops->src.hw_format != backend->hw_format || |
79 | 0 | ops->dst.hw_format != backend->hw_format) |
80 | 0 | continue; |
81 | 0 | if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) |
82 | 0 | continue; |
83 | | |
84 | 0 | av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " |
85 | 0 | "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", |
86 | 0 | backend->name, out->block_size, out->over_read, out->over_write, |
87 | 0 | out->cpu_flags); |
88 | |
|
89 | 0 | ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops); |
90 | 0 | return 0; |
91 | 0 | } |
92 | | |
93 | 0 | av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); |
94 | 0 | ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops); |
95 | 0 | return AVERROR(ENOTSUP); |
96 | 0 | } |
97 | | |
98 | | static void op_pass_free(void *ptr) |
99 | 0 | { |
100 | 0 | SwsOpPass *p = ptr; |
101 | 0 | if (!p) |
102 | 0 | return; |
103 | | |
104 | 0 | if (p->comp.free) |
105 | 0 | p->comp.free(p->comp.priv); |
106 | |
|
107 | 0 | av_free(p); |
108 | 0 | } |
109 | | |
110 | | static inline void get_row_data(const SwsOpPass *p, const int y, |
111 | | const uint8_t *in[4], uint8_t *out[4]) |
112 | 0 | { |
113 | 0 | const SwsOpExec *base = &p->exec_base; |
114 | 0 | for (int i = 0; i < p->planes_in; i++) |
115 | 0 | in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i]; |
116 | 0 | for (int i = 0; i < p->planes_out; i++) |
117 | 0 | out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i]; |
118 | 0 | } |
119 | | |
/* Pass setup callback: precompute all per-frame execution parameters so that
 * op_pass_run() only needs to patch in per-slice row pointers. Decides
 * whether the tail (last block column) must be bounced through a temporary
 * buffer on the input and/or output side. */
static void op_pass_setup(const SwsFrame *out, const SwsFrame *in,
                          const SwsPass *pass)
{
    const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format);
    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);

    SwsOpPass *p = pass->priv;
    SwsOpExec *exec = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;
    const int block_size = comp->block_size;
    /* Round the image width up to whole blocks */
    p->num_blocks = (pass->width + block_size - 1) / block_size;

    /* Set up main loop parameters */
    const int aligned_w = p->num_blocks * block_size;
    /* Width covered by all blocks except the last; the final `tail_size`
     * pixels may need special handling (see handle_tail) */
    const int safe_width = (p->num_blocks - 1) * block_size;
    const int tail_size = pass->width - safe_width;
    p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
    p->tail_off_out = safe_width * p->pixel_bits_out >> 3;
    p->tail_size_in = tail_size * p->pixel_bits_in >> 3;
    p->tail_size_out = tail_size * p->pixel_bits_out >> 3;
    p->memcpy_in = false;
    p->memcpy_out = false;

    for (int i = 0; i < p->planes_in; i++) {
        const int idx = p->idx_in[i];
        /* Frame planes 1 and 2 are chroma and may be subsampled */
        const int chroma = idx == 1 || idx == 2;
        const int sub_x = chroma ? indesc->log2_chroma_w : 0;
        const int sub_y = chroma ? indesc->log2_chroma_h : 0;
        const int plane_w = (aligned_w + sub_x) >> sub_x;
        const int plane_pad = (comp->over_read + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_in >> 3;
        /* If padded reads would run past one linesize, the tail must be
         * copied through a bounce buffer. NOTE(review): this check is only
         * performed when the backend sets slice_align — confirm intent. */
        if (comp->slice_align)
            p->memcpy_in |= plane_size + plane_pad > in->linesize[idx];
        exec->in[i] = in->data[idx];
        exec->in_stride[i] = in->linesize[idx];
        exec->in_sub_y[i] = sub_y;
        exec->in_sub_x[i] = sub_x;
    }

    for (int i = 0; i < p->planes_out; i++) {
        const int idx = p->idx_out[i];
        const int chroma = idx == 1 || idx == 2;
        const int sub_x = chroma ? outdesc->log2_chroma_w : 0;
        const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
        const int plane_w = (aligned_w + sub_x) >> sub_x;
        const int plane_pad = (comp->over_write + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_out >> 3;
        /* Same over-write consideration as above, for the output side */
        if (comp->slice_align)
            p->memcpy_out |= plane_size + plane_pad > out->linesize[idx];
        exec->out[i] = out->data[idx];
        exec->out_stride[i] = out->linesize[idx];
        exec->out_sub_y[i] = sub_y;
        exec->out_sub_x[i] = sub_x;
    }

    /* Pre-fill pointer bump for the main section only; this value does not
     * matter at all for the tail / last row handlers because they only ever
     * process a single line */
    const int blocks_main = p->num_blocks - p->memcpy_out;
    for (int i = 0; i < 4; i++) {
        exec->in_bump[i] = exec->in_stride[i] - blocks_main * exec->block_size_in;
        exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
    }
}
184 | | |
/* Dispatch kernel over the last column of the image using memcpy.
 *
 * When copy_in is set, the tail pixels of each input row are first copied
 * into an on-stack, 64-byte-aligned bounce buffer (tmp[0]) so the kernel can
 * safely over-read; when copy_out is set, the kernel writes into tmp[1] and
 * only the valid tail bytes are copied back out. `exec` is clobbered in the
 * process, so callers must invoke this after the main dispatch. */
static av_always_inline void
handle_tail(const SwsOpPass *p, SwsOpExec *exec,
            const bool copy_out, const bool copy_in,
            int y, const int h)
{
    /* tmp[0] = input bounce buffers, tmp[1] = output bounce buffers;
     * 4 planes of sizeof(uint32_t[128]) bytes each */
    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];

    const SwsOpExec *base = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;
    const int tail_size_in = p->tail_size_in;
    const int tail_size_out = p->tail_size_out;
    /* Block index of the final (tail) column */
    const int bx = p->num_blocks - 1;

    const uint8_t *in_data[4];
    uint8_t *out_data[4];
    get_row_data(p, y, in_data, out_data);

    /* Point the kernel at either the real tail or the bounce buffers */
    for (int i = 0; i < p->planes_in; i++) {
        in_data[i] += p->tail_off_in;
        if (copy_in) {
            exec->in[i] = (void *) tmp[0][i];
            exec->in_stride[i] = sizeof(tmp[0][i]);
        } else {
            exec->in[i] = in_data[i];
        }
    }

    for (int i = 0; i < p->planes_out; i++) {
        out_data[i] += p->tail_off_out;
        if (copy_out) {
            exec->out[i] = (void *) tmp[1][i];
            exec->out_stride[i] = sizeof(tmp[1][i]);
        } else {
            exec->out[i] = out_data[i];
        }
    }

    /* Process one row at a time: copy tail in, run kernel, copy tail out */
    for (int y_end = y + h; y < y_end; y++) {
        if (copy_in) {
            for (int i = 0; i < p->planes_in; i++) {
                /* Ensure the tail fits inside this plane's bounce buffer */
                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
                memcpy(tmp[0][i], in_data[i], tail_size_in);
                in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */
            }
        }

        /* Run the compiled kernel over just the single tail block */
        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);

        if (copy_out) {
            for (int i = 0; i < p->planes_out; i++) {
                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
                memcpy(out_data[i], tmp[1][i], tail_size_out);
                out_data[i] += base->out_stride[i];
            }
        }

        /* Advance any direct (non-bounced) pointers to the next row */
        for (int i = 0; i < 4; i++) {
            if (!copy_in && exec->in[i])
                exec->in[i] += exec->in_stride[i];
            if (!copy_out && exec->out[i])
                exec->out[i] += exec->out_stride[i];
        }
    }
}
250 | | |
/* Pass run callback: dispatch the compiled kernel over a horizontal slice
 * [y, y+h) of the image, splitting off the tail column and/or last row into
 * memcpy-protected handle_tail() calls as required. */
static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
                        const int h, const SwsPass *pass)
{
    const SwsOpPass *p = pass->priv;
    const SwsCompiledOp *comp = &p->comp;

    /* Fill exec metadata for this slice */
    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
    exec.slice_y = y;
    exec.slice_h = h;

    /**
     * To ensure safety, we need to consider the following:
     *
     * 1. We can overread the input, unless this is the last line of an
     *    unpadded buffer. All defined operations can handle arbitrary pixel
     *    input, so overread of arbitrary data is fine.
     *
     * 2. We can overwrite the output, as long as we don't write more than the
     *    amount of pixels that fit into one linesize. So we always need to
     *    memcpy the last column on the output side if unpadded.
     *
     * 3. For the last row, we also need to memcpy the remainder of the input,
     *    to avoid reading past the end of the buffer. Note that since we know
     *    the run() function is called on stripes of the same buffer, we don't
     *    need to worry about this for the end of a slice.
     */

    const int last_slice = y + h == pass->height;
    const bool memcpy_in = last_slice && p->memcpy_in;
    const bool memcpy_out = p->memcpy_out;
    const int num_blocks = p->num_blocks;
    /* Shave off the tail block (if bounced) and the last row (if bounced)
     * from the fast path */
    const int blocks_main = num_blocks - memcpy_out;
    const int h_main = h - memcpy_in;

    /* Handle main section */
    get_row_data(p, y, exec.in, exec.out);
    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);

    if (memcpy_in) {
        /* Safe part of last row */
        get_row_data(p, y + h_main, exec.in, exec.out);
        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
    }

    /* Handle last column via memcpy, takes over `exec` so call these last */
    if (memcpy_out)
        handle_tail(p, &exec, true, false, y, h_main);
    if (memcpy_in)
        handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
}
302 | | |
303 | | static int rw_planes(const SwsOp *op) |
304 | 0 | { |
305 | 0 | return op->rw.packed ? 1 : op->rw.elems; |
306 | 0 | } |
307 | | |
308 | | static int rw_pixel_bits(const SwsOp *op) |
309 | 0 | { |
310 | 0 | const int elems = op->rw.packed ? op->rw.elems : 1; |
311 | 0 | const int size = ff_sws_pixel_type_size(op->type); |
312 | 0 | const int bits = 8 >> op->rw.frac; |
313 | 0 | av_assert1(bits >= 1); |
314 | 0 | return elems * size * bits; |
315 | 0 | } |
316 | | |
/* Compile an operation list and register it as a graph pass.
 *
 * For "opaque" compilations, the backend's function is registered directly
 * as the pass callback and ownership of the compiled state moves to the new
 * pass; otherwise an SwsOpPass wrapper (op_pass_run/setup/free) is set up
 * around the compiled kernel. On failure, all partially-built state is
 * released before returning the error. */
static int compile(SwsGraph *graph, const SwsOpList *ops,
                   const SwsFormat *dst, SwsPass *input, SwsPass **output)
{
    SwsContext *ctx = graph->ctx;
    SwsOpPass *p = av_mallocz(sizeof(*p));
    if (!p)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
    if (ret < 0)
        goto fail;

    if (p->comp.opaque) {
        /* The backend provides its own pass function; hand the compiled
         * state straight to the graph and discard the wrapper */
        SwsCompiledOp c = p->comp;
        av_free(p);
        return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                     input, c.slice_align, c.func_opaque,
                                     NULL, c.priv, c.free, output);
    }

    /* Derive dispatch geometry from the list's read and write operations */
    const SwsOp *read = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    p->planes_in = rw_planes(read);
    p->planes_out = rw_planes(write);
    p->pixel_bits_in = rw_pixel_bits(read);
    p->pixel_bits_out = rw_pixel_bits(write);
    p->exec_base = (SwsOpExec) {
        .width = dst->width,
        .height = dst->height,
        .block_size_in = p->comp.block_size * p->pixel_bits_in >> 3,
        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
    };

    /* Map logical planes to frame plane indices; unused slots get -1 */
    for (int i = 0; i < 4; i++) {
        p->idx_in[i] = i < p->planes_in ? ops->order_src.in[i] : -1;
        p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
    }

    return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                 input, p->comp.slice_align, op_pass_run,
                                 op_pass_setup, p, op_pass_free, output);

fail:
    op_pass_free(p);
    return ret;
}
363 | | |
364 | | int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, |
365 | | const SwsFormat *dst, SwsPass *input, SwsPass **output) |
366 | 0 | { |
367 | 0 | SwsContext *ctx = graph->ctx; |
368 | 0 | int ret; |
369 | | |
370 | | /* Check if the whole operation graph is an end-to-end no-op */ |
371 | 0 | if (ff_sws_op_list_is_noop(ops)) { |
372 | 0 | *output = input; |
373 | 0 | return 0; |
374 | 0 | } |
375 | | |
376 | 0 | const SwsOp *read = ff_sws_op_list_input(ops); |
377 | 0 | const SwsOp *write = ff_sws_op_list_output(ops); |
378 | 0 | if (!read || !write) { |
379 | 0 | av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " |
380 | 0 | "and write, respectively.\n"); |
381 | 0 | return AVERROR(EINVAL); |
382 | 0 | } |
383 | | |
384 | 0 | if (flags & SWS_OP_FLAG_OPTIMIZE) { |
385 | 0 | ret = ff_sws_op_list_optimize(ops); |
386 | 0 | if (ret < 0) |
387 | 0 | return ret; |
388 | 0 | } |
389 | | |
390 | 0 | return compile(graph, ops, dst, input, output); |
391 | 0 | } |