Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libswscale/x86/ops.c
Line
Count
Source
1
/**
2
 * Copyright (C) 2025 Niklas Haas
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20
21
#include <float.h>
22
23
#include "libavutil/avassert.h"
24
#include "libavutil/mem.h"
25
26
#include "../ops_chain.h"
27
28
/* Define a static const SwsOpEntry named op_<NAME> whose .type is
 * SWS_PIXEL_<TYPE>. The variadic arguments supply any further designated
 * initializers for the entry. */
#define DECL_ENTRY(TYPE, NAME, ...)                                             \
29
    static const SwsOpEntry op_##NAME = {                                       \
30
        .type = SWS_PIXEL_##TYPE,                                               \
31
        __VA_ARGS__                                                             \
32
    }
33
34
/* Declare the external function ff_<NAME> and define the entry op_<NAME>
 * (via DECL_ENTRY) with .func pointing at it. The function is not defined
 * in this part of the file; presumably it is an assembly kernel. */
#define DECL_ASM(TYPE, NAME, ...)                                               \
35
    void ff_##NAME(void);                                                       \
36
    DECL_ENTRY(TYPE, NAME,                                                      \
37
        .func = ff_##NAME,                                                      \
38
        __VA_ARGS__)
39
40
/* Declare an entry named op_p<X><Y><Z><W>_<NAME>. X, Y, Z and W are 0/1
 * flags, one per component; a component whose flag is 0 is recorded as
 * unused (.unused = { !X, !Y, !Z, !W }). */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...)                               \
41
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME,                                      \
42
        .unused = { !X, !Y, !Z, !W },                                           \
43
        __VA_ARGS__                                                             \
44
    )
45
46
/* Address of the entry named op_p<X><Y><Z><W>_<NAME>, i.e. the name that
 * DECL_PATTERN generates for the same arguments. */
#define REF_PATTERN(NAME, X, Y, Z, W)                                           \
47
    &op_p##X##Y##Z##W##_##NAME
48
49
/* Declare NAME for the four component patterns 1000, 1001, 1110 and 1111.
 * The expansion does not end in a semicolon; call sites supply it. */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...)                                   \
50
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__);                          \
51
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__);                          \
52
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__);                          \
53
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__)                           \
54
55
/* Comma-separated addresses of the four entries that DECL_COMMON_PATTERNS
 * declares for NAME (patterns 1000, 1001, 1110, 1111), for use inside an
 * entry table initializer. */
#define REF_COMMON_PATTERNS(NAME)                                               \
56
    REF_PATTERN(NAME, 1, 0, 0, 0),                                              \
57
    REF_PATTERN(NAME, 1, 0, 0, 1),                                              \
58
    REF_PATTERN(NAME, 1, 1, 1, 0),                                              \
59
    REF_PATTERN(NAME, 1, 1, 1, 1)
60
61
/* Declare a read/write entry named op_<NAME><ELEMS><EXT> with
 * .op = SWS_OP_<OP> and .rw = { ELEMS, PACKED, FRAC }. The expansion
 * already ends in a semicolon, so call sites add none. */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC)                       \
62
    DECL_ASM(TYPE, NAME##ELEMS##EXT,                                            \
63
        .op = SWS_OP_##OP,                                                      \
64
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC },               \
65
    );
66
67
/* Declare the packed read and write entries for pixel type U<DEPTH> with
 * 2, 3 and 4 elements (names read<DEPTH>_packed<N><EXT> and
 * write<DEPTH>_packed<N><EXT>), all with .frac = 0. */
#define DECL_PACKED_RW(EXT, DEPTH)                                              \
68
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  2, true,  0)           \
69
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  3, true,  0)           \
70
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  4, true,  0)           \
71
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true,  0)           \
72
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true,  0)           \
73
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true,  0)           \
74
75
/* Declare a matching pair of entries, op_pack_<X><Y><Z><W><EXT>
 * (SWS_OP_PACK) and op_unpack_<X><Y><Z><W><EXT> (SWS_OP_UNPACK), that share
 * the same .pack.pattern = {X, Y, Z, W}. */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W)                                 \
76
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT,                                      \
77
        .op = SWS_OP_PACK,                                                      \
78
        .pack.pattern = {X, Y, Z, W},                                           \
79
    );                                                                          \
80
                                                                                \
81
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT,                                    \
82
        .op = SWS_OP_UNPACK,                                                    \
83
        .pack.pattern = {X, Y, Z, W},                                           \
84
    );                                                                          \
85
86
/**
 * Fill out->u8[0..15] with a byte shuffle mask that reverses the byte order
 * inside every element.
 *
 * The element width is ff_sws_pixel_type_size(op->type); the bit arithmetic
 * below relies on that width being a power of two.
 *
 * @return always 0
 */
static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
{
    const int low_bits = ff_sws_pixel_type_size(op->type) - 1;

    for (int idx = 0; idx < 16; idx++) {
        const int base   = idx & ~low_bits; /* first byte of this element */
        const int within = idx &  low_bits; /* byte position inside it */

        /* Mirror the position within the element */
        out->u8[idx] = base | (low_bits - within);
    }

    return 0;
}
93
94
/* Define the entry op_p<X><Y><Z><W>_swap_bytes_<TYPE><EXT> for
 * SWS_OP_SWAP_BYTES. It reuses the generic ff_p<X><Y><Z><W>_shuffle<EXT>
 * function and lets setup_swap_bytes generate the shuffle mask. Because
 * this uses DECL_ENTRY rather than DECL_ASM, the shuffle function is not
 * declared here and must already have a prototype in scope. */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W)                                  \
95
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT,                    \
96
        .op = SWS_OP_SWAP_BYTES,                                                \
97
        .unused = { !X, !Y, !Z, !W },                                           \
98
        .func = ff_p##X##Y##Z##W##_shuffle##EXT,                                \
99
        .setup = setup_swap_bytes,                                              \
100
    );
101
102
/* Declare the U8 entry op_clear_alpha<IDX><EXT> for SWS_OP_CLEAR with
 * .clear_value = -1 and component IDX flagged in .unused. */
#define DECL_CLEAR_ALPHA(EXT, IDX)                                              \
103
    DECL_ASM(U8, clear_alpha##IDX##EXT,                                         \
104
        .op = SWS_OP_CLEAR,                                                     \
105
        .clear_value = -1,                                                      \
106
        .unused[IDX] = true,                                                    \
107
    );                                                                          \
108
109
/* Declare the U8 entry op_clear_zero<IDX><EXT> for SWS_OP_CLEAR with
 * .clear_value = 0 and component IDX flagged in .unused. */
#define DECL_CLEAR_ZERO(EXT, IDX)                                               \
110
    DECL_ASM(U8, clear_zero##IDX##EXT,                                          \
111
        .op = SWS_OP_CLEAR,                                                     \
112
        .clear_value = 0,                                                       \
113
        .unused[IDX] = true,                                                    \
114
    );
115
116
/**
 * Store the clear constants for a generic SWS_OP_CLEAR kernel.
 *
 * For each of the four components, the numerator of op->c.q4[] is written
 * to out->u32[] as a 32-bit unsigned value. Denominators are not consulted.
 *
 * @return always 0
 */
static int setup_clear(const SwsOp *op, SwsOpPriv *out)
{
    int comp = 0;

    while (comp < 4) {
        out->u32[comp] = (uint32_t) op->c.q4[comp].num;
        comp++;
    }

    return 0;
}
122
123
/* Declare the U8 pattern entry op_p<X><Y><Z><W>_clear<EXT> for
 * SWS_OP_CLEAR. The values to write are supplied at setup time by
 * setup_clear, and the entry is marked .flexible. */
#define DECL_CLEAR(EXT, X, Y, Z, W)                                             \
124
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W,                                    \
125
        .op = SWS_OP_CLEAR,                                                     \
126
        .setup = setup_clear,                                                   \
127
        .flexible = true,                                                       \
128
    );
129
130
/* Declare the U8 entry op_swizzle_<X><Y><Z><W><EXT> for SWS_OP_SWIZZLE
 * with .swizzle.in = {X, Y, Z, W}. */
#define DECL_SWIZZLE(EXT, X, Y, Z, W)                                           \
131
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT,                                     \
132
        .op = SWS_OP_SWIZZLE,                                                   \
133
        .swizzle.in = {X, Y, Z, W},                                             \
134
    );
135
136
/* Declare the four common-pattern entries convert_<FROM>_<TO><EXT> for
 * SWS_OP_CONVERT: the entry type is SWS_PIXEL_<FROM> and .convert.to is
 * SWS_PIXEL_<TO>. */
#define DECL_CONVERT(EXT, FROM, TO)                                             \
137
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT,                      \
138
        .op = SWS_OP_CONVERT,                                                   \
139
        .convert.to = SWS_PIXEL_##TO,                                           \
140
    );
141
142
/* Same as DECL_CONVERT, but the entries are named expand_<FROM>_<TO><EXT>
 * and additionally set .convert.expand = true. */
#define DECL_EXPAND(EXT, FROM, TO)                                              \
143
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT,                       \
144
        .op = SWS_OP_CONVERT,                                                   \
145
        .convert.to = SWS_PIXEL_##TO,                                           \
146
        .convert.expand = true,                                                 \
147
    );
148
149
/**
 * Pass the shift amount of an LSHIFT/RSHIFT op to the kernel.
 *
 * The constant op->c.u is stored in the first 16-bit slot of the private
 * data; no other slot is written.
 *
 * @return always 0
 */
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
{
    uint16_t *const slots = out->u16;

    slots[0] = op->c.u;
    return 0;
}
154
155
/* Declare the common-pattern U16 entries lshift16<EXT> (SWS_OP_LSHIFT) and
 * rshift16<EXT> (SWS_OP_RSHIFT). Both are .flexible and take their shift
 * amount from setup_shift. */
#define DECL_SHIFT16(EXT)                                                       \
156
    DECL_COMMON_PATTERNS(U16, lshift16##EXT,                                    \
157
        .op = SWS_OP_LSHIFT,                                                    \
158
        .setup = setup_shift,                                                   \
159
        .flexible = true,                                                       \
160
    );                                                                          \
161
                                                                                \
162
    DECL_COMMON_PATTERNS(U16, rshift16##EXT,                                    \
163
        .op = SWS_OP_RSHIFT,                                                    \
164
        .setup = setup_shift,                                                   \
165
        .flexible = true,                                                       \
166
    );
167
168
/* Declare the common-pattern F32 entries min<EXT> (SWS_OP_MIN) and
 * max<EXT> (SWS_OP_MAX). Both are .flexible and use ff_sws_setup_q4 as
 * their setup callback. */
#define DECL_MIN_MAX(EXT)                                                       \
169
    DECL_COMMON_PATTERNS(F32, min##EXT,                                         \
170
        .op = SWS_OP_MIN,                                                       \
171
        .setup = ff_sws_setup_q4,                                               \
172
        .flexible = true,                                                       \
173
    );                                                                          \
174
                                                                                \
175
    DECL_COMMON_PATTERNS(F32, max##EXT,                                         \
176
        .op = SWS_OP_MAX,                                                       \
177
        .setup = ff_sws_setup_q4,                                               \
178
        .flexible = true,                                                       \
179
    );
180
181
/* Declare the common-pattern F32 entries scale<EXT> for SWS_OP_SCALE,
 * using ff_sws_setup_q as the setup callback. */
#define DECL_SCALE(EXT)                                                         \
182
    DECL_COMMON_PATTERNS(F32, scale##EXT,                                       \
183
        .op = SWS_OP_SCALE,                                                     \
184
        .setup = ff_sws_setup_q,                                                \
185
    );
186
187
/**
 * Prepare the private data for a dither kernel.
 *
 * For a 1x1 matrix (size_log2 == 0) the single coefficient is stored as a
 * float in out->f32[0] and nothing is allocated.
 *
 * Otherwise a float copy of the size x size matrix is allocated and stored
 * in out->ptr, followed by a duplicate of its first max_offset rows so that
 * a read starting at a per-component row offset can run past the last row.
 * The byte offset of each component's starting row is stored in
 * out->u16[4..7].
 *
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails
 */
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
{
    /* 1x1 matrix / single constant */
    if (!op->dither.size_log2) {
        const AVRational k = op->dither.matrix[0];
        out->f32[0] = (float) k.num / k.den;
        return 0;
    }

    const int size   = 1 << op->dither.size_log2;
    const int stride = size * sizeof(float);

    /* Row offsets wrap around the matrix height; compute them once and
     * reuse them below. */
    int row_offset[4];
    int max_offset = 0;
    for (int i = 0; i < 4; i++) {
        row_offset[i] = op->dither.y_offset[i] & (size - 1);
        max_offset = FFMAX(max_offset, row_offset[i]);
    }

    /* The byte offsets are stored as 16-bit values in the u16 slots after
     * the pointer. Check the range with an always-enabled assertion, so
     * that an oversized offset can never be silently truncated. */
    static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers not supported");
    av_assert0(max_offset * stride <= UINT16_MAX);

    /* Allocate extra rows to allow over-reading for row offsets. Note that
     * max_offset is currently never larger than 5, so the extra space needed
     * for this over-allocation is bounded by 5 * size * sizeof(float),
     * typically 320 bytes for a 16x16 dither matrix. */
    const int num_rows = size + max_offset;
    float *matrix = out->ptr = av_mallocz(num_rows * stride);
    if (!matrix)
        return AVERROR(ENOMEM);

    for (int i = 0; i < size * size; i++)
        matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;

    /* Duplicate the leading rows after the end of the matrix */
    memcpy(&matrix[size * size], matrix, max_offset * stride);

    /* Store relative pointer offset to each row inside extra space */
    uint16_t *offset = &out->u16[4];
    for (int i = 0; i < 4; i++)
        offset[i] = row_offset[i] * stride;

    return 0;
}
227
228
/* Declare the common-pattern F32 entries dither<SIZE><EXT> for
 * SWS_OP_DITHER with .dither_size = SIZE. The entries use setup_dither,
 * which only allocates when the op's size_log2 is non-zero; accordingly
 * .free is av_free for SIZE != 0 and NULL for SIZE == 0.
 * NOTE(review): SIZE is presumably the log2 matrix size that
 * setup_dither reads as size_log2 - confirm against the entry matcher. */
#define DECL_DITHER(EXT, SIZE)                                                  \
229
    DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT,                                \
230
        .op    = SWS_OP_DITHER,                                                 \
231
        .setup = setup_dither,                                                  \
232
        .free  = (SIZE) ? av_free : NULL,                                       \
233
        .dither_size = SIZE,                                                    \
234
    );
235
236
/**
 * Convert the rational 4x5 matrix of a linear op to floats.
 *
 * Allocates a float[4][5] array, stores it in out->ptr and fills it in
 * row-major order from op->lin.m. On allocation failure out->ptr is left
 * NULL.
 *
 * @return 0 on success, AVERROR(ENOMEM) if the allocation fails
 */
static int setup_linear(const SwsOp *op, SwsOpPriv *out)
{
    float *coeffs = av_mallocz(sizeof(float[4][5]));

    out->ptr = coeffs;
    if (!coeffs)
        return AVERROR(ENOMEM);

    for (int idx = 0; idx < 4 * 5; idx++) {
        const int row = idx / 5;
        const int col = idx % 5;
        coeffs[idx] = (float) op->lin.m[row][col].num / op->lin.m[row][col].den;
    }

    return 0;
}
249
250
/* Declare the F32 entry op_<NAME><EXT> for SWS_OP_LINEAR with
 * .linear_mask = MASK. The coefficient array allocated by setup_linear is
 * released through .free = av_free. */
#define DECL_LINEAR(EXT, NAME, MASK)                                            \
251
    DECL_ASM(F32, NAME##EXT,                                                    \
252
        .op    = SWS_OP_LINEAR,                                                 \
253
        .setup = setup_linear,                                                  \
254
        .free  = av_free,                                                       \
255
        .linear_mask = (MASK),                                                  \
256
    );
257
258
/* Declare every 8-bit op entry for one kernel variant (name suffix EXT) and
 * collect them in the NULL-terminated table ops8<EXT>, which carries
 * .cpu_flags = AV_CPU_FLAG_<FLAG> and .block_size = SIZE.
 *
 * The expansion also prototypes the four ff_p<XYZW>_shuffle<EXT> functions
 * without creating entries for them; presumably this is so DECL_SWAP_BYTES
 * entries, which do not declare their own function, can reference them. */
#define DECL_FUNCS_8(SIZE, EXT, FLAG)                                           \
259
    DECL_RW(EXT, U8, read_planar,   READ,  1, false, 0)                         \
260
    DECL_RW(EXT, U8, read_planar,   READ,  2, false, 0)                         \
261
    DECL_RW(EXT, U8, read_planar,   READ,  3, false, 0)                         \
262
    DECL_RW(EXT, U8, read_planar,   READ,  4, false, 0)                         \
263
    DECL_RW(EXT, U8, write_planar,  WRITE, 1, false, 0)                         \
264
    DECL_RW(EXT, U8, write_planar,  WRITE, 2, false, 0)                         \
265
    DECL_RW(EXT, U8, write_planar,  WRITE, 3, false, 0)                         \
266
    DECL_RW(EXT, U8, write_planar,  WRITE, 4, false, 0)                         \
267
    DECL_RW(EXT, U8, read_nibbles,  READ,  1, false, 1)                         \
268
    DECL_RW(EXT, U8, read_bits,     READ,  1, false, 3)                         \
269
    DECL_RW(EXT, U8, write_bits,    WRITE, 1, false, 3)                         \
270
    DECL_PACKED_RW(EXT, 8)                                                      \
271
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                       \
272
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                       \
273
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0)                                       \
274
    void ff_p1000_shuffle##EXT(void);                                           \
275
    void ff_p1001_shuffle##EXT(void);                                           \
276
    void ff_p1110_shuffle##EXT(void);                                           \
277
    void ff_p1111_shuffle##EXT(void);                                           \
278
    DECL_SWIZZLE(EXT, 3, 0, 1, 2)                                               \
279
    DECL_SWIZZLE(EXT, 3, 0, 2, 1)                                               \
280
    DECL_SWIZZLE(EXT, 2, 1, 0, 3)                                               \
281
    DECL_SWIZZLE(EXT, 3, 2, 1, 0)                                               \
282
    DECL_SWIZZLE(EXT, 3, 1, 0, 2)                                               \
283
    DECL_SWIZZLE(EXT, 3, 2, 0, 1)                                               \
284
    DECL_SWIZZLE(EXT, 1, 2, 0, 3)                                               \
285
    DECL_SWIZZLE(EXT, 1, 0, 2, 3)                                               \
286
    DECL_SWIZZLE(EXT, 2, 0, 1, 3)                                               \
287
    DECL_SWIZZLE(EXT, 2, 3, 1, 0)                                               \
288
    DECL_SWIZZLE(EXT, 2, 1, 3, 0)                                               \
289
    DECL_SWIZZLE(EXT, 1, 2, 3, 0)                                               \
290
    DECL_SWIZZLE(EXT, 1, 3, 2, 0)                                               \
291
    DECL_SWIZZLE(EXT, 0, 2, 1, 3)                                               \
292
    DECL_SWIZZLE(EXT, 0, 2, 3, 1)                                               \
293
    DECL_SWIZZLE(EXT, 0, 3, 1, 2)                                               \
294
    DECL_SWIZZLE(EXT, 3, 1, 2, 0)                                               \
295
    DECL_SWIZZLE(EXT, 0, 3, 2, 1)                                               \
296
    DECL_SWIZZLE(EXT, 0, 0, 0, 3)                                               \
297
    DECL_SWIZZLE(EXT, 3, 0, 0, 0)                                               \
298
    DECL_SWIZZLE(EXT, 0, 0, 0, 1)                                               \
299
    DECL_SWIZZLE(EXT, 1, 0, 0, 0)                                               \
300
    DECL_CLEAR_ALPHA(EXT, 0)                                                    \
301
    DECL_CLEAR_ALPHA(EXT, 1)                                                    \
302
    DECL_CLEAR_ALPHA(EXT, 3)                                                    \
303
    DECL_CLEAR_ZERO(EXT, 0)                                                     \
304
    DECL_CLEAR_ZERO(EXT, 1)                                                     \
305
    DECL_CLEAR_ZERO(EXT, 3)                                                     \
306
    DECL_CLEAR(EXT, 1, 1, 1, 0)                                                 \
307
    DECL_CLEAR(EXT, 0, 1, 1, 1)                                                 \
308
    DECL_CLEAR(EXT, 0, 0, 1, 1)                                                 \
309
    DECL_CLEAR(EXT, 1, 0, 0, 1)                                                 \
310
    DECL_CLEAR(EXT, 1, 1, 0, 0)                                                 \
311
    DECL_CLEAR(EXT, 0, 1, 0, 1)                                                 \
312
    DECL_CLEAR(EXT, 1, 0, 1, 0)                                                 \
313
    DECL_CLEAR(EXT, 1, 0, 0, 0)                                                 \
314
    DECL_CLEAR(EXT, 0, 1, 0, 0)                                                 \
315
    DECL_CLEAR(EXT, 0, 0, 1, 0)                                                 \
316
                                                                                \
317
static const SwsOpTable ops8##EXT = {                                           \
318
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                            \
319
    .block_size = SIZE,                                                         \
320
    .entries = {                                                                \
321
        &op_read_planar1##EXT,                                                  \
322
        &op_read_planar2##EXT,                                                  \
323
        &op_read_planar3##EXT,                                                  \
324
        &op_read_planar4##EXT,                                                  \
325
        &op_write_planar1##EXT,                                                 \
326
        &op_write_planar2##EXT,                                                 \
327
        &op_write_planar3##EXT,                                                 \
328
        &op_write_planar4##EXT,                                                 \
329
        &op_read8_packed2##EXT,                                                 \
330
        &op_read8_packed3##EXT,                                                 \
331
        &op_read8_packed4##EXT,                                                 \
332
        &op_write8_packed2##EXT,                                                \
333
        &op_write8_packed3##EXT,                                                \
334
        &op_write8_packed4##EXT,                                                \
335
        &op_read_nibbles1##EXT,                                                 \
336
        &op_read_bits1##EXT,                                                    \
337
        &op_write_bits1##EXT,                                                   \
338
        &op_pack_1210##EXT,                                                     \
339
        &op_pack_3320##EXT,                                                     \
340
        &op_pack_2330##EXT,                                                     \
341
        &op_unpack_1210##EXT,                                                   \
342
        &op_unpack_3320##EXT,                                                   \
343
        &op_unpack_2330##EXT,                                                   \
344
        &op_swizzle_3012##EXT,                                                  \
345
        &op_swizzle_3021##EXT,                                                  \
346
        &op_swizzle_2103##EXT,                                                  \
347
        &op_swizzle_3210##EXT,                                                  \
348
        &op_swizzle_3102##EXT,                                                  \
349
        &op_swizzle_3201##EXT,                                                  \
350
        &op_swizzle_1203##EXT,                                                  \
351
        &op_swizzle_1023##EXT,                                                  \
352
        &op_swizzle_2013##EXT,                                                  \
353
        &op_swizzle_2310##EXT,                                                  \
354
        &op_swizzle_2130##EXT,                                                  \
355
        &op_swizzle_1230##EXT,                                                  \
356
        &op_swizzle_1320##EXT,                                                  \
357
        &op_swizzle_0213##EXT,                                                  \
358
        &op_swizzle_0231##EXT,                                                  \
359
        &op_swizzle_0312##EXT,                                                  \
360
        &op_swizzle_3120##EXT,                                                  \
361
        &op_swizzle_0321##EXT,                                                  \
362
        &op_swizzle_0003##EXT,                                                  \
363
        &op_swizzle_0001##EXT,                                                  \
364
        &op_swizzle_3000##EXT,                                                  \
365
        &op_swizzle_1000##EXT,                                                  \
366
        &op_clear_alpha0##EXT,                                                  \
367
        &op_clear_alpha1##EXT,                                                  \
368
        &op_clear_alpha3##EXT,                                                  \
369
        &op_clear_zero0##EXT,                                                   \
370
        &op_clear_zero1##EXT,                                                   \
371
        &op_clear_zero3##EXT,                                                   \
372
        REF_PATTERN(clear##EXT, 1, 1, 1, 0),                                    \
373
        REF_PATTERN(clear##EXT, 0, 1, 1, 1),                                    \
374
        REF_PATTERN(clear##EXT, 0, 0, 1, 1),                                    \
375
        REF_PATTERN(clear##EXT, 1, 0, 0, 1),                                    \
376
        REF_PATTERN(clear##EXT, 1, 1, 0, 0),                                    \
377
        REF_PATTERN(clear##EXT, 0, 1, 0, 1),                                    \
378
        REF_PATTERN(clear##EXT, 1, 0, 1, 0),                                    \
379
        REF_PATTERN(clear##EXT, 1, 0, 0, 0),                                    \
380
        REF_PATTERN(clear##EXT, 0, 1, 0, 0),                                    \
381
        REF_PATTERN(clear##EXT, 0, 0, 1, 0),                                    \
382
        NULL                                                                    \
383
    },                                                                          \
384
};
385
386
/* Declare every 16-bit op entry for one kernel variant (name suffix EXT):
 * packed reads/writes, pack/unpack, byte swapping, 16-bit shifts and the
 * U8 <-> U16 conversions. The entries are collected in the NULL-terminated
 * table ops16<EXT>, which carries .cpu_flags = AV_CPU_FLAG_<FLAG> and
 * .block_size = SIZE.
 *
 * The DECL_SWAP_BYTES entries reference ff_p<XYZW>_shuffle<EXT>, which this
 * macro does not declare; those prototypes must already be in scope. */
#define DECL_FUNCS_16(SIZE, EXT, FLAG)                                          \
387
    DECL_PACKED_RW(EXT, 16)                                                     \
388
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                      \
389
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                      \
390
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                      \
391
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0)                                       \
392
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1)                                       \
393
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0)                                       \
394
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1)                                       \
395
    DECL_SHIFT16(EXT)                                                           \
396
    DECL_CONVERT(EXT,  U8, U16)                                                 \
397
    DECL_CONVERT(EXT, U16,  U8)                                                 \
398
    DECL_EXPAND(EXT,   U8, U16)                                                 \
399
                                                                                \
400
static const SwsOpTable ops16##EXT = {                                          \
401
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                            \
402
    .block_size = SIZE,                                                         \
403
    .entries = {                                                                \
404
        &op_read16_packed2##EXT,                                                \
405
        &op_read16_packed3##EXT,                                                \
406
        &op_read16_packed4##EXT,                                                \
407
        &op_write16_packed2##EXT,                                               \
408
        &op_write16_packed3##EXT,                                               \
409
        &op_write16_packed4##EXT,                                               \
410
        &op_pack_4440##EXT,                                                     \
411
        &op_pack_5550##EXT,                                                     \
412
        &op_pack_5650##EXT,                                                     \
413
        &op_unpack_4440##EXT,                                                   \
414
        &op_unpack_5550##EXT,                                                   \
415
        &op_unpack_5650##EXT,                                                   \
416
        REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                               \
417
        REF_COMMON_PATTERNS(convert_U8_U16##EXT),                               \
418
        REF_COMMON_PATTERNS(convert_U16_U8##EXT),                               \
419
        REF_COMMON_PATTERNS(expand_U8_U16##EXT),                                \
420
        REF_COMMON_PATTERNS(lshift16##EXT),                                     \
421
        REF_COMMON_PATTERNS(rshift16##EXT),                                     \
422
        NULL                                                                    \
423
    },                                                                          \
424
};
425
426
#define DECL_FUNCS_32(SIZE, EXT, FLAG)                                          \
427
    DECL_PACKED_RW(_m2##EXT, 32)                                                \
428
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2)                              \
429
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10)                              \
430
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0)                                  \
431
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1)                                  \
432
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0)                                  \
433
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1)                                  \
434
    DECL_CONVERT(EXT,  U8, U32)                                                 \
435
    DECL_CONVERT(EXT, U32,  U8)                                                 \
436
    DECL_CONVERT(EXT, U16, U32)                                                 \
437
    DECL_CONVERT(EXT, U32, U16)                                                 \
438
    DECL_CONVERT(EXT,  U8, F32)                                                 \
439
    DECL_CONVERT(EXT, F32,  U8)                                                 \
440
    DECL_CONVERT(EXT, U16, F32)                                                 \
441
    DECL_CONVERT(EXT, F32, U16)                                                 \
442
    DECL_EXPAND(EXT,   U8, U32)                                                 \
443
    DECL_MIN_MAX(EXT)                                                           \
444
    DECL_SCALE(EXT)                                                             \
445
    DECL_DITHER(EXT, 0)                                                         \
446
    DECL_DITHER(EXT, 1)                                                         \
447
    DECL_DITHER(EXT, 2)                                                         \
448
    DECL_DITHER(EXT, 3)                                                         \
449
    DECL_DITHER(EXT, 4)                                                         \
450
    DECL_DITHER(EXT, 5)                                                         \
451
    DECL_DITHER(EXT, 6)                                                         \
452
    DECL_DITHER(EXT, 7)                                                         \
453
    DECL_DITHER(EXT, 8)                                                         \
454
    DECL_LINEAR(EXT, luma,      SWS_MASK_LUMA)                                  \
455
    DECL_LINEAR(EXT, alpha,     SWS_MASK_ALPHA)                                 \
456
    DECL_LINEAR(EXT, lumalpha,  SWS_MASK_LUMA | SWS_MASK_ALPHA)                 \
457
    DECL_LINEAR(EXT, dot3,      0x7)                                            \
458
    DECL_LINEAR(EXT, row0,      SWS_MASK_ROW(0))                                \
459
    DECL_LINEAR(EXT, row0a,     SWS_MASK_ROW(0) | SWS_MASK_ALPHA)               \
460
    DECL_LINEAR(EXT, diag3,     SWS_MASK_DIAG3)                                 \
461
    DECL_LINEAR(EXT, diag4,     SWS_MASK_DIAG4)                                 \
462
    DECL_LINEAR(EXT, diagoff3,  SWS_MASK_DIAG3 | SWS_MASK_OFF3)                 \
463
    DECL_LINEAR(EXT, matrix3,   SWS_MASK_MAT3)                                  \
464
    DECL_LINEAR(EXT, affine3,   SWS_MASK_MAT3 | SWS_MASK_OFF3)                  \
465
    DECL_LINEAR(EXT, affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
466
    DECL_LINEAR(EXT, matrix4,   SWS_MASK_MAT4)                                  \
467
    DECL_LINEAR(EXT, affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)                  \
468
                                                                                \
469
static const SwsOpTable ops32##EXT = {                                          \
470
    .cpu_flags = AV_CPU_FLAG_##FLAG,                                            \
471
    .block_size = SIZE,                                                         \
472
    .entries = {                                                                \
473
        &op_read32_packed2_m2##EXT,                                             \
474
        &op_read32_packed3_m2##EXT,                                             \
475
        &op_read32_packed4_m2##EXT,                                             \
476
        &op_write32_packed2_m2##EXT,                                            \
477
        &op_write32_packed3_m2##EXT,                                            \
478
        &op_write32_packed4_m2##EXT,                                            \
479
        &op_pack_1010102_m2##EXT,                                               \
480
        &op_pack_2101010_m2##EXT,                                               \
481
        &op_unpack_1010102_m2##EXT,                                             \
482
        &op_unpack_2101010_m2##EXT,                                             \
483
        REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT),                            \
484
        REF_COMMON_PATTERNS(convert_U8_U32##EXT),                               \
485
        REF_COMMON_PATTERNS(convert_U32_U8##EXT),                               \
486
        REF_COMMON_PATTERNS(convert_U16_U32##EXT),                              \
487
        REF_COMMON_PATTERNS(convert_U32_U16##EXT),                              \
488
        REF_COMMON_PATTERNS(convert_U8_F32##EXT),                               \
489
        REF_COMMON_PATTERNS(convert_F32_U8##EXT),                               \
490
        REF_COMMON_PATTERNS(convert_U16_F32##EXT),                              \
491
        REF_COMMON_PATTERNS(convert_F32_U16##EXT),                              \
492
        REF_COMMON_PATTERNS(expand_U8_U32##EXT),                                \
493
        REF_COMMON_PATTERNS(min##EXT),                                          \
494
        REF_COMMON_PATTERNS(max##EXT),                                          \
495
        REF_COMMON_PATTERNS(scale##EXT),                                        \
496
        REF_COMMON_PATTERNS(dither0##EXT),                                      \
497
        REF_COMMON_PATTERNS(dither1##EXT),                                      \
498
        REF_COMMON_PATTERNS(dither2##EXT),                                      \
499
        REF_COMMON_PATTERNS(dither3##EXT),                                      \
500
        REF_COMMON_PATTERNS(dither4##EXT),                                      \
501
        REF_COMMON_PATTERNS(dither5##EXT),                                      \
502
        REF_COMMON_PATTERNS(dither6##EXT),                                      \
503
        REF_COMMON_PATTERNS(dither7##EXT),                                      \
504
        REF_COMMON_PATTERNS(dither8##EXT),                                      \
505
        &op_luma##EXT,                                                          \
506
        &op_alpha##EXT,                                                         \
507
        &op_lumalpha##EXT,                                                      \
508
        &op_dot3##EXT,                                                          \
509
        &op_row0##EXT,                                                          \
510
        &op_row0a##EXT,                                                         \
511
        &op_diag3##EXT,                                                         \
512
        &op_diag4##EXT,                                                         \
513
        &op_diagoff3##EXT,                                                      \
514
        &op_matrix3##EXT,                                                       \
515
        &op_affine3##EXT,                                                       \
516
        &op_affine3a##EXT,                                                      \
517
        &op_matrix4##EXT,                                                       \
518
        &op_affine4##EXT,                                                       \
519
        NULL                                                                    \
520
    },                                                                          \
521
};
522
523
/*
 * Instantiate the operation tables for every supported combination of
 * component depth, block size and instruction set. For DECL_FUNCS_32 the
 * arguments are (SIZE, EXT, FLAG): SIZE becomes the table's .block_size, EXT
 * is the symbol suffix and FLAG selects AV_CPU_FLAG_<FLAG>. The _8 and _16
 * variants presumably take the same arguments -- their definitions are not
 * part of this excerpt. The resulting tables (ops8_m1_sse4 ... ops32_avx2)
 * are the ones referenced by the tables[] array in compile().
 */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
524
DECL_FUNCS_8(32, _m1_avx2, AVX2)
525
DECL_FUNCS_8(32, _m2_sse4, SSE4)
526
DECL_FUNCS_8(64, _m2_avx2, AVX2)
527
528
DECL_FUNCS_16(16, _m1_avx2, AVX2)
529
DECL_FUNCS_16(32, _m2_avx2, AVX2)
530
531
DECL_FUNCS_32(16, _avx2, AVX2)
532
533
static av_const int get_mmsize(const int cpu_flags)
534
0
{
535
0
    if (cpu_flags & AV_CPU_FLAG_AVX512)
536
0
        return 64;
537
0
    else if (cpu_flags & AV_CPU_FLAG_AVX2)
538
0
        return 32;
539
0
    else if (cpu_flags & AV_CPU_FLAG_SSE4)
540
0
        return 16;
541
0
    else
542
0
        return AVERROR(ENOTSUP);
543
0
}
544
545
/**
546
 * Returns true if the operation's implementation only depends on the block
547
 * size, and not the underlying pixel type
548
 */
549
static bool op_is_type_invariant(const SwsOp *op)
550
0
{
551
0
    switch (op->op) {
552
0
    case SWS_OP_READ:
553
0
    case SWS_OP_WRITE:
554
0
        return !op->rw.packed && !op->rw.frac;
555
0
    case SWS_OP_SWIZZLE:
556
0
    case SWS_OP_CLEAR:
557
0
        return true;
558
0
    }
559
560
0
    return false;
561
0
}
562
563
/**
 * Try to implement the whole operation list as a single packed byte shuffle.
 *
 * The 16-byte shuffle mask is solved for one 128-bit lane; a heap copy of it
 * is stored in out->priv (released through out->free, which is av_free).
 *
 * @param ops    operation list to solve
 * @param mmsize widest usable vector size in bytes; forced down to 16 when a
 *               lane would read or write fewer than 16 bytes
 * @param out    receives the kernel, mask, block size, over-read/over-write
 *               amounts and required CPU flags on success
 * @return 0 on success; a negative error code passed through from
 *         ff_sws_solve_shuffle(); AVERROR(ENOMEM) if the mask cannot be
 *         duplicated; AVERROR(ENOTSUP) if no kernel matches the resulting
 *         input/output sizes (*out is cleared in that case)
 */
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
{
    uint8_t shuffle[16];
    int read_bytes, write_bytes;
    int pixels;

    /* Solve the shuffle mask for one 128-bit lane only */
    pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
    if (pixels < 0)
        return pixels;

    /* We can't shuffle across lanes, so restrict the vector size to XMM
     * whenever the read/write size would be a subset of the full vector */
    if (read_bytes < 16 || write_bytes < 16)
        mmsize = 16;

    const int num_lanes = mmsize / 16;
    const int in_total  = num_lanes * read_bytes;
    const int out_total = num_lanes * write_bytes;
    const int read_size = in_total <= 4 ? 4 : /* movd */
                          in_total <= 8 ? 8 : /* movq */
                          mmsize;             /* movu */

    *out = (SwsCompiledOp) {
        .priv       = av_memdup(shuffle, sizeof(shuffle)),
        .free       = av_free,
        .block_size = pixels * num_lanes,
        .over_read  = read_size - in_total,
        .over_write = mmsize - out_total,
        .cpu_flags  = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                      mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                    AV_CPU_FLAG_SSE4,
    };

    if (!out->priv)
        return AVERROR(ENOMEM);

/* Select the kernel whose total input/output byte counts match exactly */
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)                                       \
do {                                                                            \
    SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT);                       \
    if (in_total == IN && out_total == OUT)                                     \
        out->func = ff_packed_shuffle##IN##_##OUT##_##EXT;                      \
} while (0)

    ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16,  8, sse4);
    ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
    ASSIGN_SHUFFLE_FUNC(64, 64, avx512);

    /* Builds with assertions enabled flag a missing kernel loudly ... */
    av_assert1(out->func);

    /* ... while builds that compile av_assert1() out must not report success
     * with a NULL function pointer. Release the mask and report ENOTSUP so
     * that the caller can use a different implementation instead. */
    if (!out->func) {
        av_free(out->priv);
        *out = (SwsCompiledOp) {0};
        return AVERROR(ENOTSUP);
    }

    return 0;
}
626
627
/* Normalize clear values into 32-bit integer constants */
628
static void normalize_clear(SwsOp *op)
629
0
{
630
0
    static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
631
0
    SwsOpPriv priv;
632
0
    union {
633
0
        uint32_t u32;
634
0
        int i;
635
0
    } c;
636
637
0
    ff_sws_setup_q4(op, &priv);
638
0
    for (int i = 0; i < 4; i++) {
639
0
        if (!op->c.q4[i].den)
640
0
            continue;
641
0
        switch (ff_sws_pixel_type_size(op->type)) {
642
0
        case 1: c.u32 = 0x1010101U * priv.u8[i]; break;
643
0
        case 2: c.u32 = (uint32_t)priv.u16[i] << 16 | priv.u16[i]; break;
644
0
        case 4: c.u32 = priv.u32[i]; break;
645
0
        }
646
647
0
        op->c.q4[i].num = c.i;
648
0
        op->c.q4[i].den = 1;
649
0
    }
650
0
}
651
652
/**
 * Compile an operation list into an x86 implementation.
 *
 * The packed-shuffle fast path is attempted first; unless it reports
 * AVERROR(ENOTSUP), its result is returned as-is. Otherwise a chain of
 * kernels is assembled from the op tables and terminated with the process
 * function matching the number of planes involved.
 *
 * @param ctx unused by this function
 * @param ops operation list; the type of its leading op may be rewritten
 *            while compiling
 * @param out receives the compiled operation
 * @return 0 on success, a negative AVERROR code on failure
 */
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
    const int mmsize = get_mmsize(av_get_cpu_flags());
    if (mmsize < 0)
        return mmsize;

    av_assert1(ops->num_ops > 0);

    /* Copy both endpoints up front: ops->ops[0] is modified further down */
    const SwsOp first_op = ops->ops[0];
    const SwsOp last_op  = ops->ops[ops->num_ops - 1];

    /* Special fast path for in-place packed shuffle */
    int ret = solve_shuffle(ops, mmsize, out);
    if (ret != AVERROR(ENOTSUP))
        return ret;

    SwsOpChain *chain = ff_sws_op_chain_alloc();
    if (!chain)
        return AVERROR(ENOMEM);

    *out = (SwsCompiledOp) {
        .priv = chain,
        .free = ff_sws_op_chain_free_cb,

        /* Use at most two full YMM regs during the widest precision section */
        .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
    };

    /* 3-component reads/writes process one extra garbage word */
    if (first_op.rw.packed && first_op.rw.elems == 3)
        out->over_read = sizeof(uint32_t);
    if (last_op.rw.packed && last_op.rw.elems == 3)
        out->over_write = sizeof(uint32_t);

    static const SwsOpTable *const tables[] = {
        &ops8_m1_sse4,
        &ops8_m1_avx2,
        &ops8_m2_sse4,
        &ops8_m2_avx2,
        &ops16_m1_avx2,
        &ops16_m2_avx2,
        &ops32_avx2,
    };

    /* Keep compiling for as long as the table compiler asks to be re-run */
    for (;;) {
        SwsOp *head = &ops->ops[0];
        int head_block_size = out->block_size;

        /* Type-invariant ops are matched as U8, with the block size scaled
         * by the original element size to compensate */
        if (op_is_type_invariant(head)) {
            if (head->op == SWS_OP_CLEAR)
                normalize_clear(head);
            head_block_size *= ff_sws_pixel_type_size(head->type);
            head->type = SWS_PIXEL_U8;
        }

        ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
                                       head_block_size, chain);
        if (ret != AVERROR(EAGAIN))
            break;
    }

    if (ret < 0) {
        ff_sws_op_chain_free(chain);
        return ret;
    }

/* Terminate the chain with NAME's return stub and use NAME as entry point */
#define ASSIGN_PROCESS_FUNC(NAME)                               \
    do {                                                        \
        SWS_DECL_FUNC(NAME);                                    \
        void NAME##_return(void);                               \
        ret = ff_sws_op_chain_append(chain, NAME##_return,      \
                                     NULL, &(SwsOpPriv) {0});   \
        out->func = NAME;                                       \
    } while (0)

    /* Packed data counts as a single plane */
    const int read_planes  = first_op.rw.packed ? 1 : first_op.rw.elems;
    const int write_planes = last_op.rw.packed  ? 1 : last_op.rw.elems;

    switch (FFMAX(read_planes, write_planes)) {
    case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
    case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
    case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
    case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
    }

    if (ret < 0) {
        ff_sws_op_chain_free(chain);
        return ret;
    }

    out->cpu_flags = chain->cpu_flags;
    return 0;
}
742
743
const SwsOpBackend backend_x86 = {
744
    .name       = "x86",
745
    .compile    = compile,
746
};