/src/ffmpeg/libswscale/ops_dispatch.c

Source
/**
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/refstruct.h"

#include "ops.h"
#include "ops_internal.h"
#include "ops_dispatch.h"

/**
 * Private per-pass state for a compiled operation list. Allocated in
 * compile(), refreshed for each frame by op_pass_setup(), consumed by
 * op_pass_run() and released by op_pass_free().
 */
typedef struct SwsOpPass {
    SwsCompiledOp comp; /* compiled kernel; unreferenced in op_pass_free() */
    SwsOpExec exec_base; /* execution parameters for the main block loop */
    SwsOpExec exec_tail; /* execution parameters for the separately dispatched last block */
    int num_blocks; /* blocks per line, rounded up from the pass width */
    int tail_off_in; /* byte offset of the tail region within an input line */
    int tail_off_out; /* byte offset of the tail region within an output line */
    int tail_size_in; /* bytes per line copied into the input scratch buffer */
    int tail_size_out; /* bytes per line copied back from the output scratch buffer */
    int planes_in; /* number of planes read (1 for packed reads) */
    int planes_out; /* number of planes written (1 for packed writes) */
    int pixel_bits_in; /* bits per pixel within a single input plane */
    int pixel_bits_out; /* bits per pixel within a single output plane */
    int idx_in[4]; /* frame plane index for each input plane, -1 when unused */
    int idx_out[4]; /* frame plane index for each output plane, -1 when unused */
    int *offsets_y; /* optional output-row to input-row map (set for SWS_OP_FILTER_V) */
    int filter_size; /* filter->filter_size of the horizontal kernel (set for SWS_OP_FILTER_H) */
    bool memcpy_first; /* negative input linesize too small: bounce the slice starting at y == 0 */
    bool memcpy_last; /* non-negative input linesize too small: bounce the slice ending at the last row */
    bool memcpy_out; /* output linesize too small: always bounce the last block */
    uint8_t *tail_buf; /* extra memory for fixing unpadded tails */
    unsigned int tail_buf_size; /* allocated size of tail_buf, maintained by av_fast_realloc() */
} SwsOpPass;

/**
 * Compile an operation list using one specific backend.
 *
 * The backend is given a private duplicate of `ops`, so the caller's list is
 * never handed to the backend directly. `*out` is written only on success.
 *
 * @return the backend's non-negative result on success, AVERROR(ENOMEM) if
 *         the list could not be duplicated, or the backend's negative error.
 */
int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
                               const SwsOpList *ops, SwsCompiledOp *out)
{
    SwsCompiledOp result = {0};
    SwsOpList *work = ff_sws_op_list_duplicate(ops);
    if (!work)
        return AVERROR(ENOMEM);

    /* Component metadata must be up to date before the backend sees the list */
    ff_sws_op_list_update_comps(work);

    const int err = backend->compile(ctx, work, &result);
    if (err >= 0) {
        *out = result;
    } else {
        /* ENOTSUP is logged at trace level only; anything else is an error */
        const int level = err == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
        av_log(ctx, level, "Backend '%s' failed to compile operations: %s\n",
               backend->name, av_err2str(err));
    }

    ff_sws_op_list_free(&work);
    return err;
}

/**
 * Compile an operation list with the first backend that accepts it.
 *
 * Backends are tried in the order they appear in `ff_sws_op_backends`; a
 * backend is only considered when its hardware format matches both the
 * source and the destination format of `ops`.
 *
 * @return 0 on success (with `*out` filled in), AVERROR(ENOTSUP) if no
 *         backend could compile the list.
 */
int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
{
    const SwsOpBackend *backend;
    int idx = 0;

    while ((backend = ff_sws_op_backends[idx++])) {
        const int hw_match = ops->src.hw_format == backend->hw_format &&
                             ops->dst.hw_format == backend->hw_format;
        if (!hw_match)
            continue;

        /* Failures were already logged by the per-backend helper */
        if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
            continue;

        av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
               "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
               backend->name, out->block_size, out->over_read, out->over_write,
               out->cpu_flags);

        ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops);
        return 0;
    }

    return AVERROR(ENOTSUP);
}

/**
 * Release a compiled operation: invoke its `free` callback on `priv` (when a
 * callback is set) and reset the whole structure to zero.
 */
void ff_sws_compiled_op_unref(SwsCompiledOp *comp)
{
    const SwsCompiledOp cleared = {0};

    if (comp->free)
        comp->free(comp->priv);

    *comp = cleared;
}

static void op_pass_free(void *ptr)
{
    SwsOpPass *p = ptr;
    if (!p)
        return;

    ff_sws_compiled_op_unref(&p->comp);
    av_refstruct_unref(&p->offsets_y);
    av_free(p->exec_base.in_bump_y);
    av_free(p->exec_base.in_offset_x);
    av_free(p->tail_buf);
    av_free(p);
}

/**
 * Compute the plane pointers for output row `y_dst`.
 *
 * The input row equals `y_dst` unless a vertical offset map is present, in
 * which case it is looked up there. Row indices are shifted down by each
 * plane's vertical subsampling before being scaled by the plane stride.
 */
static inline void get_row_data(const SwsOpPass *p, const int y_dst,
                                const uint8_t *in[4], uint8_t *out[4])
{
    const SwsOpExec *const ex = &p->exec_base;
    int y_src = y_dst;

    if (p->offsets_y)
        y_src = p->offsets_y[y_dst];

    for (int plane = 0; plane < p->planes_in; plane++) {
        const int row = y_src >> ex->in_sub_y[plane];
        in[plane] = ex->in[plane] + row * ex->in_stride[plane];
    }

    for (int plane = 0; plane < p->planes_out; plane++) {
        const int row = y_dst >> ex->out_sub_y[plane];
        out[plane] = ex->out[plane] + row * ex->out_stride[plane];
    }
}

/**
 * Per-frame setup callback of an operation pass.
 *
 * Fills p->exec_base from the frame's plane pointers and linesizes, then
 * decides whether the last block of each line has to be bounced through a
 * padded scratch buffer because processing whole blocks (plus the backend's
 * over-read / over-write) would exceed a plane's linesize. When bouncing is
 * needed, p->exec_tail is prepared and p->tail_buf is (re)allocated.
 *
 * @param out  destination frame
 * @param in   source frame
 * @param pass pass whose `priv` is the SwsOpPass to set up
 * @return 0 on success, AVERROR(ENOMEM) if the scratch buffer cannot be
 *         allocated
 */
static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
                         const SwsPass *pass)
{
    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);

    SwsOpPass *p = pass->priv;
    SwsOpExec *exec = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;

    /* Set up main loop parameters */
    const int block_size = comp->block_size;
    /* Round the width up to a whole number of blocks */
    const int num_blocks = (pass->width + block_size - 1) / block_size;
    const int aligned_w  = num_blocks * block_size;
    p->num_blocks   = num_blocks;
    p->memcpy_first = false;
    p->memcpy_last  = false;
    p->memcpy_out   = false;

    for (int i = 0; i < p->planes_in; i++) {
        const int idx        = p->idx_in[i];
        /* Frame planes 1 and 2 use the format's chroma subsampling shifts */
        const int chroma     = idx == 1 || idx == 2;
        const int sub_x      = chroma ? indesc->log2_chroma_w : 0;
        const int sub_y      = chroma ? indesc->log2_chroma_h : 0;
        const int plane_w    = AV_CEIL_RSHIFT(aligned_w, sub_x);
        const int plane_pad  = AV_CEIL_RSHIFT(comp->over_read, sub_x);
        const int plane_size = plane_w * p->pixel_bits_in >> 3;
        /* Bytes per line the kernel may touch: whole blocks plus over-read */
        const int total_size = plane_size + plane_pad;
        const int loop_size  = num_blocks * exec->block_size_in;
        /* The sign of the linesize selects which flag is raised; op_pass_run()
         * checks memcpy_last on the final slice and memcpy_first on the
         * slice starting at y == 0 */
        if (in->linesize[idx] >= 0) {
            p->memcpy_last |= total_size > in->linesize[idx];
        } else {
            p->memcpy_first |= total_size > -in->linesize[idx];
        }
        exec->in[i]        = in->data[idx];
        exec->in_stride[i] = in->linesize[idx];
        /* Linesize minus the bytes covered by the block loop */
        exec->in_bump[i]   = in->linesize[idx] - loop_size;
        exec->in_sub_y[i]  = sub_y;
        exec->in_sub_x[i]  = sub_x;
    }

    for (int i = 0; i < p->planes_out; i++) {
        const int idx        = p->idx_out[i];
        const int chroma     = idx == 1 || idx == 2;
        const int sub_x      = chroma ? outdesc->log2_chroma_w : 0;
        const int sub_y      = chroma ? outdesc->log2_chroma_h : 0;
        const int plane_w    = AV_CEIL_RSHIFT(aligned_w, sub_x);
        const int plane_pad  = AV_CEIL_RSHIFT(comp->over_write, sub_x);
        const int plane_size = plane_w * p->pixel_bits_out >> 3;
        const int loop_size  = num_blocks * exec->block_size_out;
        /* Unlike the input side, the output check ignores the linesize sign */
        p->memcpy_out |= plane_size + plane_pad > FFABS(out->linesize[idx]);
        exec->out[i]        = out->data[idx];
        exec->out_stride[i] = out->linesize[idx];
        exec->out_bump[i]   = out->linesize[idx] - loop_size;
        exec->out_sub_y[i]  = sub_y;
        exec->out_sub_x[i]  = sub_x;
    }

    /* Nothing to bounce: op_pass_run() will always take its fast path */
    const bool memcpy_in = p->memcpy_first || p->memcpy_last;
    if (!memcpy_in && !p->memcpy_out)
        return 0;

    /* Set-up tail section parameters and buffers */
    SwsOpExec *tail = &p->exec_tail;
    const int align = av_cpu_max_align();
    size_t alloc_size = 0;
    /* Start from a copy of the main parameters; only the fields that differ
     * for the scratch buffers are overwritten below */
    *tail = *exec;

    /* Pixels covered by every block except the last one */
    const int safe_width = (num_blocks - 1) * block_size;
    const int tail_size  = pass->width - safe_width;
    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
    p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;

    if (exec->in_offset_x) {
        /* With a horizontal offset map the tail region runs from the byte
         * offset of the first tail pixel to that of the last pixel, extended
         * by filter_size pixels' worth of bytes */
        p->tail_off_in  = exec->in_offset_x[safe_width];
        p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
        p->tail_size_in += (p->filter_size * p->pixel_bits_in + 7) >> 3;
    } else {
        p->tail_off_in  = safe_width * p->pixel_bits_in >> 3;
        p->tail_size_in = (tail_size * p->pixel_bits_in + 7) >> 3;
    }

    /* NOTE: inside the next two loops `block_size` is a byte count (size_t)
     * that shadows the pixel-count `block_size` declared above */
    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
        size_t block_size = (comp->block_size * p->pixel_bits_in + 7) >> 3;
        block_size += comp->over_read;
        block_size = FFMAX(block_size, p->tail_size_in);
        tail->in_stride[i] = FFALIGN(block_size, align);
        tail->in_bump[i] = tail->in_stride[i] - exec->block_size_in;
        /* One scratch row is reserved per input frame line */
        alloc_size += tail->in_stride[i] * in->height;
    }

    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
        size_t block_size = (comp->block_size * p->pixel_bits_out + 7) >> 3;
        block_size += comp->over_write;
        block_size = FFMAX(block_size, p->tail_size_out);
        tail->out_stride[i] = FFALIGN(block_size, align);
        tail->out_bump[i] = tail->out_stride[i] - exec->block_size_out;
        /* One scratch row is reserved per output frame line */
        alloc_size += tail->out_stride[i] * out->height;
    }

    if (memcpy_in && exec->in_offset_x) {
        /* `in_offset_x` is indexed relative to the line start, not the start
         * of the section being processed; so we need to over-allocate this
         * array to the full width of the image, even though we will only
         * partially fill in the offsets relevant to the tail region */
        alloc_size += aligned_w * sizeof(*exec->in_offset_x);
    }

    /* p->tail_buf is only replaced on success, so a failed call leaves the
     * previous pointer in place for op_pass_free() */
    uint8_t *tail_buf = av_fast_realloc(p->tail_buf, &p->tail_buf_size, alloc_size);
    if (!tail_buf)
        return AVERROR(ENOMEM);
    p->tail_buf = tail_buf;

    /* Carve the buffer up in the same order the sizes were accumulated:
     * input planes, output planes, then the rebased offset map */
    for (int i = 0; memcpy_in && i < p->planes_in; i++) {
        tail->in[i] = tail_buf;
        tail_buf += tail->in_stride[i] * in->height;
    }

    for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
        tail->out[i] = tail_buf;
        tail_buf += tail->out_stride[i] * out->height;
    }

    if (memcpy_in && exec->in_offset_x) {
        tail->in_offset_x = (int32_t *) tail_buf;
        /* Only indices [safe_width, aligned_w) are written; the offsets are
         * rebased so they are relative to the start of the copied region */
        for (int i = safe_width; i < aligned_w; i++)
            tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
    }

    return 0;
}

/**
 * Copy `h` rows of `bytes` bytes each from `src` to `dst`, advancing each
 * pointer by its own stride after every row. Nothing is copied when `h` is
 * zero or negative.
 */
static void copy_lines(uint8_t *dst, const size_t dst_stride,
                       const uint8_t *src, const size_t src_stride,
                       const int h, const size_t bytes)
{
    uint8_t *to = dst;
    const uint8_t *from = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        memcpy(to, from, bytes);
        to   += dst_stride;
        from += src_stride;
    }
}

/**
 * Run callback of an operation pass: process rows [y, y + h) of the frame.
 *
 * When neither input nor output needs bouncing for this slice, the compiled
 * kernel is dispatched once over all blocks. Otherwise all but the last
 * block are processed in place, and the last block is dispatched separately
 * using the parameters prepared in p->exec_tail, with the input and/or
 * output routed through the padded scratch buffer.
 *
 * @param out  destination frame (plane pointers are taken from the state
 *             prepared by op_pass_setup(), not from this argument)
 * @param in   source frame (same remark as for `out`)
 * @param y    first row of the slice
 * @param h    number of rows in the slice
 * @param pass pass whose `priv` is the SwsOpPass to run
 */
static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
                        const int h, const SwsPass *pass)
{
    const SwsOpPass *p = pass->priv;
    const SwsCompiledOp *comp = &p->comp;

    /* Fill exec metadata for this slice */
    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
    exec.slice_y = y;
    exec.slice_h = h;

    /**
     *  To ensure safety, we need to consider the following:
     *
     * 1. We can overread the input, unless this is the last line of an
     *    unpadded buffer. All defined operations can handle arbitrary pixel
     *    input, so overread of arbitrary data is fine. For flipped images,
     *    this condition is actually *inverted* to where the first line is
     *    the one at the end of the buffer.
     *
     * 2. We can overwrite the output, as long as we don't write more than the
     *    amount of pixels that fit into one linesize. So we always need to
     *    memcpy the last column on the output side if unpadded.
     */

    const bool memcpy_in  = (p->memcpy_last && y + h == pass->height) ||
                            (p->memcpy_first && y == 0);
    const bool memcpy_out = p->memcpy_out;
    const int num_blocks  = p->num_blocks;

    get_row_data(p, y, exec.in, exec.out);
    if (!memcpy_in && !memcpy_out) {
        /* Fast path (fully aligned/padded inputs and outputs) */
        comp->func(&exec, comp->priv, 0, y, num_blocks, y + h);
        return;
    }

    /* Non-aligned case (slow path); process num_blocks - 1 main blocks and
     * a separate tail (via memcpy into an appropriately padded buffer) */
    for (int i = 0; i < 4; i++) {
        /* We process one fewer block, so the in_bump needs to be increased
         * to reflect that the plane pointers are left on the last block,
         * not the end of the processed line, after each loop iteration */
        exec.in_bump[i]  += exec.block_size_in;
        exec.out_bump[i] += exec.block_size_out;
    }

    comp->func(&exec, comp->priv, 0, y, num_blocks - 1, y + h);

    DECLARE_ALIGNED_32(SwsOpExec, tail) = p->exec_tail;
    tail.slice_y = y;
    tail.slice_h = h;

    for (int i = 0; i < p->planes_in; i++) {
        /* Input offsets are relative to the base pointer */
        if (!exec.in_offset_x || memcpy_in)
            exec.in[i] += p->tail_off_in;
        tail.in[i] += y * tail.in_stride[i];
    }
    for (int i = 0; i < p->planes_out; i++) {
        exec.out[i] += p->tail_off_out;
        tail.out[i] += y * tail.out_stride[i];
    }

    if (!memcpy_in) {
        /* op_pass_setup() rebases exec_tail.in_offset_x onto the scratch
         * buffer whenever *any* slice needs an input memcpy. This slice
         * reads the frame directly and its plane pointers were left at the
         * line start above, so the absolute offset map has to be used;
         * otherwise the tail block would sample the wrong columns. This is
         * a no-op when no offset map exists or it was never rebased. */
        tail.in_offset_x = exec.in_offset_x;
    }

    for (int i = 0; i < p->planes_in; i++) {
        if (memcpy_in) {
            copy_lines((uint8_t *) tail.in[i], tail.in_stride[i],
                       exec.in[i], exec.in_stride[i], h, p->tail_size_in);
        } else {
            /* Reuse input pointers directly */
            tail.in[i]        = exec.in[i];
            tail.in_stride[i] = exec.in_stride[i];
            tail.in_bump[i]   = exec.in_stride[i] - exec.block_size_in;
        }
    }

    for (int i = 0; !memcpy_out && i < p->planes_out; i++) {
        /* Reuse output pointers directly */
        tail.out[i]        = exec.out[i];
        tail.out_stride[i] = exec.out_stride[i];
        tail.out_bump[i]   = exec.out_stride[i] - exec.block_size_out;
    }

    /* Dispatch kernel over tail */
    comp->func(&tail, comp->priv, num_blocks - 1, y, num_blocks, y + h);

    /* Copy only the valid part of the tail back into the real output */
    for (int i = 0; memcpy_out && i < p->planes_out; i++) {
        copy_lines(exec.out[i], exec.out_stride[i],
                   tail.out[i], tail.out_stride[i], h, p->tail_size_out);
    }
}

/* Number of planes touched by a read/write op: packed ops use a single
 * plane, planar ops use one plane per element. */
static int rw_planes(const SwsOp *op)
{
    if (op->rw.packed)
        return 1;

    return op->rw.elems;
}

/* Bits occupied by one pixel within a single plane of a read/write op:
 * (elements per plane) * (size of the pixel type) * (8 >> frac). */
static int rw_pixel_bits(const SwsOp *op)
{
    const int type_size = ff_sws_pixel_type_size(op->type);
    const int unit_bits = 8 >> op->rw.frac;
    int elems_per_plane = 1;

    /* Packed ops store all elements of a pixel in the same plane */
    if (op->rw.packed)
        elems_per_plane = op->rw.elems;

    av_assert1(unit_bits >= 1);
    return elems_per_plane * type_size * unit_bits;
}

/**
 * Compile `ops` into a single pass and append that pass to `graph`.
 *
 * @param graph  graph receiving the new pass
 * @param ops    operation list to compile (not modified or freed here)
 * @param input  pass feeding the new pass
 * @param output forwarded to ff_sws_graph_add_pass() to receive the new pass
 * @return the result of ff_sws_graph_add_pass() on the success path, or a
 *         negative error code from compilation / allocation
 */
static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
                   SwsPass **output)
{
    SwsContext *ctx = graph->ctx;
    SwsOpPass *p = av_mallocz(sizeof(*p));
    if (!p)
        return AVERROR(ENOMEM);

    int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
    if (ret < 0)
        goto fail;

    const SwsFormat *dst = &ops->dst;
    if (p->comp.opaque) {
        /* Opaque kernels are registered directly with their own function,
         * private data and free callback; the SwsOpPass wrapper is not
         * needed, so only the wrapper itself is released here */
        SwsCompiledOp c = p->comp;
        av_free(p);
        return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                     input, c.slice_align, c.func_opaque,
                                     NULL, c.priv, c.free, output);
    }

    const SwsOp *read  = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    p->planes_in  = rw_planes(read);
    p->planes_out = rw_planes(write);
    p->pixel_bits_in  = rw_pixel_bits(read);
    p->pixel_bits_out = rw_pixel_bits(write);
    /* Block sizes are stored in bytes (bits >> 3) */
    p->exec_base = (SwsOpExec) {
        .width  = dst->width,
        .height = dst->height,
        .block_size_in  = p->comp.block_size * p->pixel_bits_in  >> 3,
        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
    };

    /* Map each used plane slot to its frame plane; unused slots become -1 */
    for (int i = 0; i < 4; i++) {
        p->idx_in[i]  = i < p->planes_in  ? ops->plane_src[i] : -1;
        p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1;
    }

    /* `filter` is only dereferenced inside the two filter branches below */
    const SwsFilterWeights *filter = read->rw.kernel;
    if (read->rw.filter == SWS_OP_FILTER_V) {
        /* Extra reference on the row offsets; dropped in op_pass_free() */
        p->offsets_y = av_refstruct_ref(filter->offsets);

        /* Compute relative pointer bumps for each output line */
        int32_t *bump = av_malloc_array(filter->dst_size, sizeof(*bump));
        if (!bump) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        /* bump[y] is the distance between consecutive row offsets minus one;
         * presumably the kernel already advances by one line on its own
         * (TODO confirm against the backend) */
        int line = filter->offsets[0];
        for (int y = 0; y < filter->dst_size - 1; y++) {
            int next = filter->offsets[y + 1];
            bump[y] = next - line - 1;
            line = next;
        }
        bump[filter->dst_size - 1] = 0;
        p->exec_base.in_bump_y = bump;
    } else if (read->rw.filter == SWS_OP_FILTER_H) {
        /* Compute pixel offset map for each output line */
        const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
        int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
        if (!offset) {
            ret = AVERROR(ENOMEM);
            goto fail;
        }

        /* Offsets are converted from pixels to bytes; entries past dst_size
         * (padding up to a whole block) repeat the last valid offset */
        for (int x = 0; x < filter->dst_size; x++)
            offset[x] = filter->offsets[x] * p->pixel_bits_in >> 3;
        for (int x = filter->dst_size; x < pixels; x++)
            offset[x] = offset[filter->dst_size - 1];
        p->exec_base.in_offset_x = offset;
        p->exec_base.block_size_in = 0; /* ptr does not advance */
        p->filter_size = filter->filter_size;
    }

    /* `p` and its free callback are handed to the graph from here on */
    return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
                                 input, p->comp.slice_align, op_pass_run,
                                 op_pass_setup, p, op_pass_free, output);

fail:
    /* Releases everything acquired so far, including `p` itself */
    op_pass_free(p);
    return ret;
}

/**
 * Compile an operation list into one or more passes appended to `graph`.
 *
 * The list is first compiled as a single pass. If that fails with
 * AVERROR(ENOTSUP), it is split with ff_sws_op_list_subpass() and each part
 * is compiled as its own pass, chained one after the other.
 *
 * Ownership: the list in `*pops` is always freed and `*pops` is set to NULL,
 * on success and on failure alike. On failure every pass added to the graph
 * by this call is rolled back.
 *
 * @param graph  graph receiving the passes
 * @param pops   operation list to compile; consumed
 * @param flags  SWS_OP_FLAG_OPTIMIZE requests optimization before compiling
 * @param input  pass feeding the first new pass
 * @param output receives the last pass (or `input` for a no-op list)
 * @return 0 or a non-negative value on success, a negative error otherwise
 */
int ff_sws_compile_pass(SwsGraph *graph, SwsOpList **pops, int flags,
                        SwsPass *input, SwsPass **output)
{
    /* Remembered so that a failure can roll the graph back to this point */
    const int passes_orig = graph->num_passes;
    SwsContext *ctx = graph->ctx;
    SwsOpList *ops = *pops;
    int ret = 0;

    /* Check if the whole operation graph is an end-to-end no-op */
    if (ff_sws_op_list_is_noop(ops)) {
        *output = input;
        goto out;
    }

    const SwsOp *read  = ff_sws_op_list_input(ops);
    const SwsOp *write = ff_sws_op_list_output(ops);
    if (!read || !write) {
        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
               "and write, respectively.\n");
        ret = AVERROR(EINVAL);
        goto out;
    }

    if (flags & SWS_OP_FLAG_OPTIMIZE) {
        ret = ff_sws_op_list_optimize(ops);
        if (ret < 0)
            goto out;
        av_log(ctx, AV_LOG_DEBUG, "Operation list after optimizing:\n");
        ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
    }

    /* Only ENOTSUP falls through to the split retry; success and every
     * other error leave through the common exit */
    ret = compile(graph, ops, input, output);
    if (ret != AVERROR(ENOTSUP))
        goto out;

    av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n");
    SwsPass *prev = input;
    while (ops) {
        /* `rest` receives whatever was split off `ops`; it is NULL when
         * nothing was split off (see the check below) */
        SwsOpList *rest;
        ret = ff_sws_op_list_subpass(ops, &rest);
        if (ret < 0)
            goto out;

        if (prev == input && !rest) {
            /* No point in compiling an unsplit pass again */
            ret = AVERROR(ENOTSUP);
            goto out;
        }

        /* Chain this part after the previously compiled one */
        ret = compile(graph, ops, prev, &prev);
        if (ret < 0) {
            /* `ops` is freed at the common exit; `rest` must be freed here */
            ff_sws_op_list_free(&rest);
            goto out;
        }

        ff_sws_op_list_free(&ops);
        ops = rest;
    }

    /* Return last subpass successfully compiled */
    av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n",
           graph->num_passes - passes_orig);
    *output = prev;

out:
    if (ret == AVERROR(ENOTSUP)) {
        av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
        ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
    }
    /* Drop any passes this call added before it failed */
    if (ret < 0)
        ff_sws_graph_rollback(graph, passes_orig);
    ff_sws_op_list_free(&ops);
    *pops = NULL;
    return ret;
}

Coverage Report

Created: 2026-04-01 07:42

Line	Count	Source
1		/**
2		* Copyright (C) 2025 Niklas Haas
3		*
4		* This file is part of FFmpeg.
5		*
6		* FFmpeg is free software; you can redistribute it and/or
7		* modify it under the terms of the GNU Lesser General Public
8		* License as published by the Free Software Foundation; either
9		* version 2.1 of the License, or (at your option) any later version.
10		*
11		* FFmpeg is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14		* Lesser General Public License for more details.
15		*
16		* You should have received a copy of the GNU Lesser General Public
17		* License along with FFmpeg; if not, write to the Free Software
18		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19		*/
20
21		#include "libavutil/avassert.h"
22		#include "libavutil/cpu.h"
23		#include "libavutil/mem.h"
24		#include "libavutil/mem_internal.h"
25		#include "libavutil/refstruct.h"
26
27		#include "ops.h"
28		#include "ops_internal.h"
29		#include "ops_dispatch.h"
30
31		typedef struct SwsOpPass {
32		SwsCompiledOp comp;
33		SwsOpExec exec_base;
34		SwsOpExec exec_tail;
35		int num_blocks;
36		int tail_off_in;
37		int tail_off_out;
38		int tail_size_in;
39		int tail_size_out;
40		int planes_in;
41		int planes_out;
42		int pixel_bits_in;
43		int pixel_bits_out;
44		int idx_in[4];
45		int idx_out[4];
46		int *offsets_y;
47		int filter_size;
48		bool memcpy_first;
49		bool memcpy_last;
50		bool memcpy_out;
51		uint8_t *tail_buf; /* extra memory for fixing unpadded tails */
52		unsigned int tail_buf_size;
53		} SwsOpPass;
54
55		int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
56		const SwsOpList *ops, SwsCompiledOp *out)
57	0	{
58	0	SwsOpList *copy;
59	0	SwsCompiledOp compiled = {0};
60	0	int ret = 0;
61
62	0	copy = ff_sws_op_list_duplicate(ops);
63	0	if (!copy)
64	0	return AVERROR(ENOMEM);
65
66		/* Ensure these are always set during compilation */
67	0	ff_sws_op_list_update_comps(copy);
68
69	0	ret = backend->compile(ctx, copy, &compiled);
70	0	if (ret < 0) {
71	0	int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
72	0	av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
73	0	backend->name, av_err2str(ret));
74	0	} else {
75	0	*out = compiled;
76	0	}
77
78	0	ff_sws_op_list_free(&copy);
79	0	return ret;
80	0	}
81
82		int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
83	0	{
84	0	for (int n = 0; ff_sws_op_backends[n]; n++) {
85	0	const SwsOpBackend *backend = ff_sws_op_backends[n];
86	0	if (ops->src.hw_format != backend->hw_format \|\|
87	0	ops->dst.hw_format != backend->hw_format)
88	0	continue;
89	0	if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
90	0	continue;
91
92	0	av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
93	0	"block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
94	0	backend->name, out->block_size, out->over_read, out->over_write,
95	0	out->cpu_flags);
96
97	0	ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops);
98	0	return 0;
99	0	}
100
101	0	return AVERROR(ENOTSUP);
102	0	}
103
104		void ff_sws_compiled_op_unref(SwsCompiledOp *comp)
105	0	{
106	0	if (comp->free)
107	0	comp->free(comp->priv);
108
109	0	*comp = (SwsCompiledOp) {0};
110	0	}
111
112		static void op_pass_free(void *ptr)
113	0	{
114	0	SwsOpPass *p = ptr;
115	0	if (!p)
116	0	return;
117
118	0	ff_sws_compiled_op_unref(&p->comp);
119	0	av_refstruct_unref(&p->offsets_y);
120	0	av_free(p->exec_base.in_bump_y);
121	0	av_free(p->exec_base.in_offset_x);
122	0	av_free(p->tail_buf);
123	0	av_free(p);
124	0	}
125
126		static inline void get_row_data(const SwsOpPass *p, const int y_dst,
127		const uint8_t *in[4], uint8_t *out[4])
128	0	{
129	0	const SwsOpExec *base = &p->exec_base;
130	0	const int y_src = p->offsets_y ? p->offsets_y[y_dst] : y_dst;
131	0	for (int i = 0; i < p->planes_in; i++)
132	0	in[i] = base->in[i] + (y_src >> base->in_sub_y[i]) * base->in_stride[i];
133	0	for (int i = 0; i < p->planes_out; i++)
134	0	out[i] = base->out[i] + (y_dst >> base->out_sub_y[i]) * base->out_stride[i];
135	0	}
136
137		static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
138		const SwsPass *pass)
139	0	{
140	0	const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format);
141	0	const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);
142
143	0	SwsOpPass *p = pass->priv;
144	0	SwsOpExec *exec = &p->exec_base;
145	0	const SwsCompiledOp *comp = &p->comp;
146
147		/* Set up main loop parameters */
148	0	const int block_size = comp->block_size;
149	0	const int num_blocks = (pass->width + block_size - 1) / block_size;
150	0	const int aligned_w = num_blocks * block_size;
151	0	p->num_blocks = num_blocks;
152	0	p->memcpy_first = false;
153	0	p->memcpy_last = false;
154	0	p->memcpy_out = false;
155
156	0	for (int i = 0; i < p->planes_in; i++) {
157	0	const int idx = p->idx_in[i];
158	0	const int chroma = idx == 1 \|\| idx == 2;
159	0	const int sub_x = chroma ? indesc->log2_chroma_w : 0;
160	0	const int sub_y = chroma ? indesc->log2_chroma_h : 0;
161	0	const int plane_w = AV_CEIL_RSHIFT(aligned_w, sub_x);
162	0	const int plane_pad = AV_CEIL_RSHIFT(comp->over_read, sub_x);
163	0	const int plane_size = plane_w * p->pixel_bits_in >> 3;
164	0	const int total_size = plane_size + plane_pad;
165	0	const int loop_size = num_blocks * exec->block_size_in;
166	0	if (in->linesize[idx] >= 0) {
167	0	p->memcpy_last \|= total_size > in->linesize[idx];
168	0	} else {
169	0	p->memcpy_first \|= total_size > -in->linesize[idx];
170	0	}
171	0	exec->in[i] = in->data[idx];
172	0	exec->in_stride[i] = in->linesize[idx];
173	0	exec->in_bump[i] = in->linesize[idx] - loop_size;
174	0	exec->in_sub_y[i] = sub_y;
175	0	exec->in_sub_x[i] = sub_x;
176	0	}
177
178	0	for (int i = 0; i < p->planes_out; i++) {
179	0	const int idx = p->idx_out[i];
180	0	const int chroma = idx == 1 \|\| idx == 2;
181	0	const int sub_x = chroma ? outdesc->log2_chroma_w : 0;
182	0	const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
183	0	const int plane_w = AV_CEIL_RSHIFT(aligned_w, sub_x);
184	0	const int plane_pad = AV_CEIL_RSHIFT(comp->over_write, sub_x);
185	0	const int plane_size = plane_w * p->pixel_bits_out >> 3;
186	0	const int loop_size = num_blocks * exec->block_size_out;
187	0	p->memcpy_out \|= plane_size + plane_pad > FFABS(out->linesize[idx]);
188	0	exec->out[i] = out->data[idx];
189	0	exec->out_stride[i] = out->linesize[idx];
190	0	exec->out_bump[i] = out->linesize[idx] - loop_size;
191	0	exec->out_sub_y[i] = sub_y;
192	0	exec->out_sub_x[i] = sub_x;
193	0	}
194
195	0	const bool memcpy_in = p->memcpy_first \|\| p->memcpy_last;
196	0	if (!memcpy_in && !p->memcpy_out)
197	0	return 0;
198
199		/* Set-up tail section parameters and buffers */
200	0	SwsOpExec *tail = &p->exec_tail;
201	0	const int align = av_cpu_max_align();
202	0	size_t alloc_size = 0;
203	0	*tail = *exec;
204
205	0	const int safe_width = (num_blocks - 1) * block_size;
206	0	const int tail_size = pass->width - safe_width;
207	0	p->tail_off_out = safe_width * p->pixel_bits_out >> 3;
208	0	p->tail_size_out = (tail_size * p->pixel_bits_out + 7) >> 3;
209
210	0	if (exec->in_offset_x) {
211	0	p->tail_off_in = exec->in_offset_x[safe_width];
212	0	p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
213	0	p->tail_size_in += (p->filter_size * p->pixel_bits_in + 7) >> 3;
214	0	} else {
215	0	p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
216	0	p->tail_size_in = (tail_size * p->pixel_bits_in + 7) >> 3;
217	0	}
218
219	0	for (int i = 0; memcpy_in && i < p->planes_in; i++) {
220	0	size_t block_size = (comp->block_size * p->pixel_bits_in + 7) >> 3;
221	0	block_size += comp->over_read;
222	0	block_size = FFMAX(block_size, p->tail_size_in);
223	0	tail->in_stride[i] = FFALIGN(block_size, align);
224	0	tail->in_bump[i] = tail->in_stride[i] - exec->block_size_in;
225	0	alloc_size += tail->in_stride[i] * in->height;
226	0	}
227
228	0	for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
229	0	size_t block_size = (comp->block_size * p->pixel_bits_out + 7) >> 3;
230	0	block_size += comp->over_write;
231	0	block_size = FFMAX(block_size, p->tail_size_out);
232	0	tail->out_stride[i] = FFALIGN(block_size, align);
233	0	tail->out_bump[i] = tail->out_stride[i] - exec->block_size_out;
234	0	alloc_size += tail->out_stride[i] * out->height;
235	0	}
236
237	0	if (memcpy_in && exec->in_offset_x) {
238		/* `in_offset_x` is indexed relative to the line start, not the start
239		* of the section being processed; so we need to over-allocate this
240		* array to the full width of the image, even though we will only
241		* partially fill in the offsets relevant to the tail region */
242	0	alloc_size += aligned_w * sizeof(*exec->in_offset_x);
243	0	}
244
245	0	uint8_t *tail_buf = av_fast_realloc(p->tail_buf, &p->tail_buf_size, alloc_size);
246	0	if (!tail_buf)
247	0	return AVERROR(ENOMEM);
248	0	p->tail_buf = tail_buf;
249
250	0	for (int i = 0; memcpy_in && i < p->planes_in; i++) {
251	0	tail->in[i] = tail_buf;
252	0	tail_buf += tail->in_stride[i] * in->height;
253	0	}
254
255	0	for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
256	0	tail->out[i] = tail_buf;
257	0	tail_buf += tail->out_stride[i] * out->height;
258	0	}
259
260	0	if (memcpy_in && exec->in_offset_x) {
261	0	tail->in_offset_x = (int32_t *) tail_buf;
262	0	for (int i = safe_width; i < aligned_w; i++)
263	0	tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
264	0	}
265
266	0	return 0;
267	0	}
268
269		static void copy_lines(uint8_t *dst, const size_t dst_stride,
270		const uint8_t *src, const size_t src_stride,
271		const int h, const size_t bytes)
272	0	{
273	0	for (int y = 0; y < h; y++) {
274	0	memcpy(dst, src, bytes);
275	0	dst += dst_stride;
276	0	src += src_stride;
277	0	}
278	0	}
279
280		static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
281		const int h, const SwsPass *pass)
282	0	{
283	0	const SwsOpPass *p = pass->priv;
284	0	const SwsCompiledOp *comp = &p->comp;
285
286		/* Fill exec metadata for this slice */
287	0	DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
288	0	exec.slice_y = y;
289	0	exec.slice_h = h;
290
291		/**
292		* To ensure safety, we need to consider the following:
293		*
294		* 1. We can overread the input, unless this is the last line of an
295		* unpadded buffer. All defined operations can handle arbitrary pixel
296		* input, so overread of arbitrary data is fine. For flipped images,
297		* this condition is actually inverted to where the first line is
298		* the one at the end of the buffer.
299		*
300		* 2. We can overwrite the output, as long as we don't write more than the
301		* amount of pixels that fit into one linesize. So we always need to
302		* memcpy the last column on the output side if unpadded.
303		*/
304
305	0	const bool memcpy_in = p->memcpy_last && y + h == pass->height \|\|
306	0	p->memcpy_first && y == 0;
307	0	const bool memcpy_out = p->memcpy_out;
308	0	const int num_blocks = p->num_blocks;
309
310	0	get_row_data(p, y, exec.in, exec.out);
311	0	if (!memcpy_in && !memcpy_out) {
312		/* Fast path (fully aligned/padded inputs and outputs) */
313	0	comp->func(&exec, comp->priv, 0, y, num_blocks, y + h);
314	0	return;
315	0	}
316
317		/* Non-aligned case (slow path); process num_blocks - 1 main blocks and
318		* a separate tail (via memcpy into an appropriately padded buffer) */
319	0	for (int i = 0; i < 4; i++) {
320		/* We process one fewer block, so the in_bump needs to be increased
321		* to reflect that the plane pointers are left on the last block,
322		* not the end of the processed line, after each loop iteration */
323	0	exec.in_bump[i] += exec.block_size_in;
324	0	exec.out_bump[i] += exec.block_size_out;
325	0	}
326
327	0	comp->func(&exec, comp->priv, 0, y, num_blocks - 1, y + h);
328
329	0	DECLARE_ALIGNED_32(SwsOpExec, tail) = p->exec_tail;
330	0	tail.slice_y = y;
331	0	tail.slice_h = h;
332
333	0	for (int i = 0; i < p->planes_in; i++) {
334		/* Input offsets are relative to the base pointer */
335	0	if (!exec.in_offset_x \|\| memcpy_in)
336	0	exec.in[i] += p->tail_off_in;
337	0	tail.in[i] += y * tail.in_stride[i];
338	0	}
339	0	for (int i = 0; i < p->planes_out; i++) {
340	0	exec.out[i] += p->tail_off_out;
341	0	tail.out[i] += y * tail.out_stride[i];
342	0	}
343
344	0	for (int i = 0; i < p->planes_in; i++) {
345	0	if (memcpy_in) {
346	0	copy_lines((uint8_t *) tail.in[i], tail.in_stride[i],
347	0	exec.in[i], exec.in_stride[i], h, p->tail_size_in);
348	0	} else {
349		/* Reuse input pointers directly */
350	0	tail.in[i] = exec.in[i];
351	0	tail.in_stride[i] = exec.in_stride[i];
352	0	tail.in_bump[i] = exec.in_stride[i] - exec.block_size_in;
353	0	}
354	0	}
355
356	0	for (int i = 0; !memcpy_out && i < p->planes_out; i++) {
357		/* Reuse output pointers directly */
358	0	tail.out[i] = exec.out[i];
359	0	tail.out_stride[i] = exec.out_stride[i];
360	0	tail.out_bump[i] = exec.out_stride[i] - exec.block_size_out;
361	0	}
362
363		/* Dispatch kernel over tail */
364	0	comp->func(&tail, comp->priv, num_blocks - 1, y, num_blocks, y + h);
365
366	0	for (int i = 0; memcpy_out && i < p->planes_out; i++) {
367	0	copy_lines(exec.out[i], exec.out_stride[i],
368	0	tail.out[i], tail.out_stride[i], h, p->tail_size_out);
369	0	}
370	0	}
371
372		static int rw_planes(const SwsOp *op)
373	0	{
374	0	return op->rw.packed ? 1 : op->rw.elems;
375	0	}
376
377		static int rw_pixel_bits(const SwsOp *op)
378	0	{
379	0	const int elems = op->rw.packed ? op->rw.elems : 1;
380	0	const int size = ff_sws_pixel_type_size(op->type);
381	0	const int bits = 8 >> op->rw.frac;
382	0	av_assert1(bits >= 1);
383	0	return elems * size * bits;
384	0	}
385
386		static int compile(SwsGraph graph, const SwsOpList ops, SwsPass *input,
387		SwsPass **output)
388	0	{
389	0	SwsContext *ctx = graph->ctx;
390	0	SwsOpPass p = av_mallocz(sizeof(p));
391	0	if (!p)
392	0	return AVERROR(ENOMEM);
393
394	0	int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
395	0	if (ret < 0)
396	0	goto fail;
397
398	0	const SwsFormat *dst = &ops->dst;
399	0	if (p->comp.opaque) {
400	0	SwsCompiledOp c = p->comp;
401	0	av_free(p);
402	0	return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
403	0	input, c.slice_align, c.func_opaque,
404	0	NULL, c.priv, c.free, output);
405	0	}
406
407	0	const SwsOp *read = ff_sws_op_list_input(ops);
408	0	const SwsOp *write = ff_sws_op_list_output(ops);
409	0	p->planes_in = rw_planes(read);
410	0	p->planes_out = rw_planes(write);
411	0	p->pixel_bits_in = rw_pixel_bits(read);
412	0	p->pixel_bits_out = rw_pixel_bits(write);
413	0	p->exec_base = (SwsOpExec) {
414	0	.width = dst->width,
415	0	.height = dst->height,
416	0	.block_size_in = p->comp.block_size * p->pixel_bits_in >> 3,
417	0	.block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
418	0	};
419
420	0	for (int i = 0; i < 4; i++) {
421	0	p->idx_in[i] = i < p->planes_in ? ops->plane_src[i] : -1;
422	0	p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1;
423	0	}
424
425	0	const SwsFilterWeights *filter = read->rw.kernel;
426	0	if (read->rw.filter == SWS_OP_FILTER_V) {
427	0	p->offsets_y = av_refstruct_ref(filter->offsets);
428
429		/* Compute relative pointer bumps for each output line */
430	0	int32_t bump = av_malloc_array(filter->dst_size, sizeof(bump));
431	0	if (!bump) {
432	0	ret = AVERROR(ENOMEM);
433	0	goto fail;
434	0	}
435
436	0	int line = filter->offsets[0];
437	0	for (int y = 0; y < filter->dst_size - 1; y++) {
438	0	int next = filter->offsets[y + 1];
439	0	bump[y] = next - line - 1;
440	0	line = next;
441	0	}
442	0	bump[filter->dst_size - 1] = 0;
443	0	p->exec_base.in_bump_y = bump;
444	0	} else if (read->rw.filter == SWS_OP_FILTER_H) {
445		/* Compute pixel offset map for each output line */
446	0	const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
447	0	int32_t offset = av_malloc_array(pixels, sizeof(offset));
448	0	if (!offset) {
449	0	ret = AVERROR(ENOMEM);
450	0	goto fail;
451	0	}
452
453	0	for (int x = 0; x < filter->dst_size; x++)
454	0	offset[x] = filter->offsets[x] * p->pixel_bits_in >> 3;
455	0	for (int x = filter->dst_size; x < pixels; x++)
456	0	offset[x] = offset[filter->dst_size - 1];
457	0	p->exec_base.in_offset_x = offset;
458	0	p->exec_base.block_size_in = 0; /* ptr does not advance */
459	0	p->filter_size = filter->filter_size;
460	0	}
461
462	0	return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
463	0	input, p->comp.slice_align, op_pass_run,
464	0	op_pass_setup, p, op_pass_free, output);
465
466	0	fail:
467	0	op_pass_free(p);
468	0	return ret;
469	0	}
470
471		int ff_sws_compile_pass(SwsGraph graph, SwsOpList *pops, int flags,
472		SwsPass input, SwsPass *output)
473	0	{
474	0	const int passes_orig = graph->num_passes;
475	0	SwsContext *ctx = graph->ctx;
476	0	SwsOpList ops = pops;
477	0	int ret = 0;
478
479		/* Check if the whole operation graph is an end-to-end no-op */
480	0	if (ff_sws_op_list_is_noop(ops)) {
481	0	*output = input;
482	0	goto out;
483	0	}
484
485	0	const SwsOp *read = ff_sws_op_list_input(ops);
486	0	const SwsOp *write = ff_sws_op_list_output(ops);
487	0	if (!read \|\| !write) {
488	0	av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
489	0	"and write, respectively.\n");
490	0	ret = AVERROR(EINVAL);
491	0	goto out;
492	0	}
493
494	0	if (flags & SWS_OP_FLAG_OPTIMIZE) {
495	0	ret = ff_sws_op_list_optimize(ops);
496	0	if (ret < 0)
497	0	goto out;
498	0	av_log(ctx, AV_LOG_DEBUG, "Operation list after optimizing:\n");
499	0	ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
500	0	}
501
502	0	ret = compile(graph, ops, input, output);
503	0	if (ret != AVERROR(ENOTSUP))
504	0	goto out;
505
506	0	av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n");
507	0	SwsPass *prev = input;
508	0	while (ops) {
509	0	SwsOpList *rest;
510	0	ret = ff_sws_op_list_subpass(ops, &rest);
511	0	if (ret < 0)
512	0	goto out;
513
514	0	if (prev == input && !rest) {
515		/* No point in compiling an unsplit pass again */
516	0	ret = AVERROR(ENOTSUP);
517	0	goto out;
518	0	}
519
520	0	ret = compile(graph, ops, prev, &prev);
521	0	if (ret < 0) {
522	0	ff_sws_op_list_free(&rest);
523	0	goto out;
524	0	}
525
526	0	ff_sws_op_list_free(&ops);
527	0	ops = rest;
528	0	}
529
530		/* Return last subpass successfully compiled */
531	0	av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n",
532	0	graph->num_passes - passes_orig);
533	0	*output = prev;
534
535	0	out:
536	0	if (ret == AVERROR(ENOTSUP)) {
537	0	av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
538	0	ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
539	0	}
540	0	if (ret < 0)
541	0	ff_sws_graph_rollback(graph, passes_orig);
542	0	ff_sws_op_list_free(&ops);
543		*pops = NULL;
544	0	return ret;
545	0	}