/src/ffmpeg/libswscale/x86/ops.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include <float.h> |
22 | | |
23 | | #include "libavutil/avassert.h" |
24 | | #include "libavutil/mem.h" |
25 | | |
26 | | #include "../ops_chain.h" |
27 | | |
/* Declare a static SwsOpEntry with the given pixel type; remaining designated
 * initializers are forwarded verbatim. */
#define DECL_ENTRY(TYPE, NAME, ...)                                            \
    static const SwsOpEntry op_##NAME = {                                      \
        .type = SWS_PIXEL_##TYPE,                                              \
        __VA_ARGS__                                                            \
    }

/* Like DECL_ENTRY, but additionally forward-declares the assembly kernel
 * ff_##NAME and wires it up as the entry's implementation. */
#define DECL_ASM(TYPE, NAME, ...)                                              \
    void ff_##NAME(void);                                                      \
    DECL_ENTRY(TYPE, NAME,                                                     \
        .func = ff_##NAME,                                                     \
        __VA_ARGS__)

/* Declare an asm entry specialized for a component pattern; X/Y/Z/W are 0/1
 * flags, with 0 marking the corresponding component as unused. */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...)                              \
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME,                                     \
        .unused = { !X, !Y, !Z, !W },                                          \
        __VA_ARGS__                                                            \
    )

/* Reference an entry previously declared with DECL_PATTERN. */
#define REF_PATTERN(NAME, X, Y, Z, W)                                          \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four commonly used component patterns:
 * 1000, 1001, 1110 and 1111. */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...)                                  \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__)                          \
54 | | |
/* Reference all four common-pattern entries declared via DECL_COMMON_PATTERNS. */
#define REF_COMMON_PATTERNS(NAME)                                              \
    REF_PATTERN(NAME, 1, 0, 0, 0),                                             \
    REF_PATTERN(NAME, 1, 0, 0, 1),                                             \
    REF_PATTERN(NAME, 1, 1, 1, 0),                                             \
    REF_PATTERN(NAME, 1, 1, 1, 1)

/* Declare a read/write entry with the given element count, packed flag and
 * fractional size shift. */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC)                      \
    DECL_ASM(TYPE, NAME##ELEMS##EXT,                                           \
        .op = SWS_OP_##OP,                                                     \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC },              \
    );

/* Packed (interleaved) reads and writes for 2..4 components at a given
 * bit depth. */
#define DECL_PACKED_RW(EXT, DEPTH)                                             \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  2, true, 0)           \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  3, true, 0)           \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  4, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0)           \

/* Declare a matching pair of pack / unpack entries for the per-component
 * bit pattern {X, Y, Z, W}. */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W)                                \
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT,                                     \
        .op = SWS_OP_PACK,                                                     \
        .pack.pattern = {X, Y, Z, W},                                          \
    );                                                                         \
                                                                               \
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT,                                   \
        .op = SWS_OP_UNPACK,                                                   \
        .pack.pattern = {X, Y, Z, W},                                          \
    );                                                                         \
86 | | static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out) |
87 | 0 | { |
88 | 0 | const int mask = ff_sws_pixel_type_size(op->type) - 1; |
89 | 0 | for (int i = 0; i < 16; i++) |
90 | 0 | out->u8[i] = (i & ~mask) | (mask - (i & mask)); |
91 | 0 | return 0; |
92 | 0 | } |
93 | | |
/* Byte-swap entries reuse the generic shuffle kernel, with the shuffle mask
 * generated at setup time by setup_swap_bytes(). */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W)                                 \
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT,                   \
        .op = SWS_OP_SWAP_BYTES,                                               \
        .unused = { !X, !Y, !Z, !W },                                          \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT,                               \
        .setup = setup_swap_bytes,                                             \
    );

/* Clear component IDX to an all-ones value. */
#define DECL_CLEAR_ALPHA(EXT, IDX)                                             \
    DECL_ASM(U8, clear_alpha##IDX##EXT,                                        \
        .op = SWS_OP_CLEAR,                                                    \
        .clear_value = -1,                                                     \
        .unused[IDX] = true,                                                   \
    );                                                                         \

/* Clear component IDX to zero. */
#define DECL_CLEAR_ZERO(EXT, IDX)                                              \
    DECL_ASM(U8, clear_zero##IDX##EXT,                                         \
        .op = SWS_OP_CLEAR,                                                    \
        .clear_value = 0,                                                      \
        .unused[IDX] = true,                                                   \
    );
115 | | |
116 | | static int setup_clear(const SwsOp *op, SwsOpPriv *out) |
117 | 0 | { |
118 | 0 | for (int i = 0; i < 4; i++) |
119 | 0 | out->u32[i] = (uint32_t) op->c.q4[i].num; |
120 | 0 | return 0; |
121 | 0 | } |
122 | | |
/* Clear entry taking its per-component constants from setup_clear(). */
#define DECL_CLEAR(EXT, X, Y, Z, W)                                            \
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W,                                   \
        .op = SWS_OP_CLEAR,                                                    \
        .setup = setup_clear,                                                  \
        .flexible = true,                                                      \
    );

/* Component reordering with the fixed input mapping {X, Y, Z, W}. */
#define DECL_SWIZZLE(EXT, X, Y, Z, W)                                          \
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT,                                    \
        .op = SWS_OP_SWIZZLE,                                                  \
        .swizzle.in = {X, Y, Z, W},                                            \
    );

/* Pixel type conversion FROM -> TO, for all common component patterns. */
#define DECL_CONVERT(EXT, FROM, TO)                                            \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT,                     \
        .op = SWS_OP_CONVERT,                                                  \
        .convert.to = SWS_PIXEL_##TO,                                          \
    );

/* Like DECL_CONVERT, but with the `expand` flag set on the conversion. */
#define DECL_EXPAND(EXT, FROM, TO)                                             \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT,                      \
        .op = SWS_OP_CONVERT,                                                  \
        .convert.to = SWS_PIXEL_##TO,                                          \
        .convert.expand = true,                                                \
    );
148 | | |
/* Stash the shift amount (op->c.u) as a 16-bit value for the shift kernels. */
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
{
    out->u16[0] = op->c.u;
    return 0;
}
154 | | |
/* Left/right shift on 16-bit values; shift amount supplied by setup_shift(). */
#define DECL_SHIFT16(EXT)                                                      \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT,                                   \
        .op = SWS_OP_LSHIFT,                                                   \
        .setup = setup_shift,                                                  \
        .flexible = true,                                                      \
    );                                                                         \
                                                                               \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT,                                   \
        .op = SWS_OP_RSHIFT,                                                   \
        .setup = setup_shift,                                                  \
        .flexible = true,                                                      \
    );

/* Float min/max against per-component constants (ff_sws_setup_q4). */
#define DECL_MIN_MAX(EXT)                                                      \
    DECL_COMMON_PATTERNS(F32, min##EXT,                                        \
        .op = SWS_OP_MIN,                                                      \
        .setup = ff_sws_setup_q4,                                              \
        .flexible = true,                                                      \
    );                                                                         \
                                                                               \
    DECL_COMMON_PATTERNS(F32, max##EXT,                                        \
        .op = SWS_OP_MAX,                                                      \
        .setup = ff_sws_setup_q4,                                              \
        .flexible = true,                                                      \
    );

/* Float multiply by a single scalar constant (ff_sws_setup_q). */
#define DECL_SCALE(EXT)                                                        \
    DECL_COMMON_PATTERNS(F32, scale##EXT,                                      \
        .op = SWS_OP_SCALE,                                                    \
        .setup = ff_sws_setup_q,                                               \
    );
186 | | |
187 | | static int setup_dither(const SwsOp *op, SwsOpPriv *out) |
188 | 0 | { |
189 | | /* 1x1 matrix / single constant */ |
190 | 0 | if (!op->dither.size_log2) { |
191 | 0 | const AVRational k = op->dither.matrix[0]; |
192 | 0 | out->f32[0] = (float) k.num / k.den; |
193 | 0 | return 0; |
194 | 0 | } |
195 | | |
196 | 0 | const int size = 1 << op->dither.size_log2; |
197 | 0 | int max_offset = 0; |
198 | 0 | for (int i = 0; i < 4; i++) { |
199 | 0 | const int offset = op->dither.y_offset[i] & (size - 1); |
200 | 0 | max_offset = FFMAX(max_offset, offset); |
201 | 0 | } |
202 | | |
203 | | /* Allocate extra rows to allow over-reading for row offsets. Note that |
204 | | * max_offset is currently never larger than 5, so the extra space needed |
205 | | * for this over-allocation is bounded by 5 * size * sizeof(float), |
206 | | * typically 320 bytes for a 16x16 dither matrix. */ |
207 | 0 | const int stride = size * sizeof(float); |
208 | 0 | const int num_rows = size + max_offset; |
209 | 0 | float *matrix = out->ptr = av_mallocz(num_rows * stride); |
210 | 0 | if (!matrix) |
211 | 0 | return AVERROR(ENOMEM); |
212 | | |
213 | 0 | for (int i = 0; i < size * size; i++) |
214 | 0 | matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den; |
215 | |
|
216 | 0 | memcpy(&matrix[size * size], matrix, max_offset * stride); |
217 | | |
218 | | /* Store relative pointer offset to each row inside extra space */ |
219 | 0 | static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers not supported"); |
220 | 0 | assert(max_offset * stride <= UINT16_MAX); |
221 | 0 | uint16_t *offset = &out->u16[4]; |
222 | 0 | for (int i = 0; i < 4; i++) |
223 | 0 | offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride; |
224 | |
|
225 | 0 | return 0; |
226 | 0 | } |
227 | | |
/* Dither entry for a (1 << SIZE) x (1 << SIZE) matrix; SIZE == 0 stores a
 * single inline constant (no allocation), hence the conditional .free. */
#define DECL_DITHER(EXT, SIZE)                                                 \
    DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT,                               \
        .op = SWS_OP_DITHER,                                                   \
        .setup = setup_dither,                                                 \
        .free = (SIZE) ? av_free : NULL,                                       \
        .dither_size = SIZE,                                                   \
    );
235 | | |
236 | | static int setup_linear(const SwsOp *op, SwsOpPriv *out) |
237 | 0 | { |
238 | 0 | float *matrix = out->ptr = av_mallocz(sizeof(float[4][5])); |
239 | 0 | if (!matrix) |
240 | 0 | return AVERROR(ENOMEM); |
241 | | |
242 | 0 | for (int y = 0; y < 4; y++) { |
243 | 0 | for (int x = 0; x < 5; x++) |
244 | 0 | matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den; |
245 | 0 | } |
246 | |
|
247 | 0 | return 0; |
248 | 0 | } |
249 | | |
/* Linear (matrix) entry specialized for a particular coefficient mask;
 * the float coefficients are prepared by setup_linear(). */
#define DECL_LINEAR(EXT, NAME, MASK)                                           \
    DECL_ASM(F32, NAME##EXT,                                                   \
        .op = SWS_OP_LINEAR,                                                   \
        .setup = setup_linear,                                                 \
        .free = av_free,                                                       \
        .linear_mask = (MASK),                                                 \
    );
257 | | |
/* Declare all 8-bit entries (reads/writes, pack/unpack, swizzles, clears)
 * plus the op table ops8##EXT tying them together for one block size /
 * instruction set combination. The ff_p*_shuffle##EXT kernels declared here
 * are shared with the swap-bytes entries of the wider-depth tables. */
#define DECL_FUNCS_8(SIZE, EXT, FLAG)                                          \
    DECL_RW(EXT, U8, read_planar,  READ,  1, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  2, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  3, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  4, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0)                         \
    DECL_RW(EXT, U8, read_nibbles, READ,  1, false, 1)                         \
    DECL_RW(EXT, U8, read_bits,    READ,  1, false, 3)                         \
    DECL_RW(EXT, U8, write_bits,   WRITE, 1, false, 3)                         \
    DECL_PACKED_RW(EXT, 8)                                                     \
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                      \
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                      \
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0)                                      \
    void ff_p1000_shuffle##EXT(void);                                          \
    void ff_p1001_shuffle##EXT(void);                                          \
    void ff_p1110_shuffle##EXT(void);                                          \
    void ff_p1111_shuffle##EXT(void);                                          \
    DECL_SWIZZLE(EXT, 3, 0, 1, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 0, 2, 1)                                              \
    DECL_SWIZZLE(EXT, 2, 1, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 3, 2, 1, 0)                                              \
    DECL_SWIZZLE(EXT, 3, 1, 0, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 2, 0, 1)                                              \
    DECL_SWIZZLE(EXT, 1, 2, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 1, 0, 2, 3)                                              \
    DECL_SWIZZLE(EXT, 2, 0, 1, 3)                                              \
    DECL_SWIZZLE(EXT, 2, 3, 1, 0)                                              \
    DECL_SWIZZLE(EXT, 2, 1, 3, 0)                                              \
    DECL_SWIZZLE(EXT, 1, 2, 3, 0)                                              \
    DECL_SWIZZLE(EXT, 1, 3, 2, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 2, 1, 3)                                              \
    DECL_SWIZZLE(EXT, 0, 2, 3, 1)                                              \
    DECL_SWIZZLE(EXT, 0, 3, 1, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 1, 2, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 3, 2, 1)                                              \
    DECL_SWIZZLE(EXT, 0, 0, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 3, 0, 0, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 0, 0, 1)                                              \
    DECL_SWIZZLE(EXT, 1, 0, 0, 0)                                              \
    DECL_CLEAR_ALPHA(EXT, 0)                                                   \
    DECL_CLEAR_ALPHA(EXT, 1)                                                   \
    DECL_CLEAR_ALPHA(EXT, 3)                                                   \
    DECL_CLEAR_ZERO(EXT, 0)                                                    \
    DECL_CLEAR_ZERO(EXT, 1)                                                    \
    DECL_CLEAR_ZERO(EXT, 3)                                                    \
    DECL_CLEAR(EXT, 1, 1, 1, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 1, 1)                                                \
    DECL_CLEAR(EXT, 0, 0, 1, 1)                                                \
    DECL_CLEAR(EXT, 1, 0, 0, 1)                                                \
    DECL_CLEAR(EXT, 1, 1, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 0, 1)                                                \
    DECL_CLEAR(EXT, 1, 0, 1, 0)                                                \
    DECL_CLEAR(EXT, 1, 0, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 0, 1, 0)                                                \
                                                                               \
    static const SwsOpTable ops8##EXT = {                                      \
        .cpu_flags  = AV_CPU_FLAG_##FLAG,                                      \
        .block_size = SIZE,                                                    \
        .entries = {                                                           \
            &op_read_planar1##EXT,                                             \
            &op_read_planar2##EXT,                                             \
            &op_read_planar3##EXT,                                             \
            &op_read_planar4##EXT,                                             \
            &op_write_planar1##EXT,                                            \
            &op_write_planar2##EXT,                                            \
            &op_write_planar3##EXT,                                            \
            &op_write_planar4##EXT,                                            \
            &op_read8_packed2##EXT,                                            \
            &op_read8_packed3##EXT,                                            \
            &op_read8_packed4##EXT,                                            \
            &op_write8_packed2##EXT,                                           \
            &op_write8_packed3##EXT,                                           \
            &op_write8_packed4##EXT,                                           \
            &op_read_nibbles1##EXT,                                            \
            &op_read_bits1##EXT,                                               \
            &op_write_bits1##EXT,                                              \
            &op_pack_1210##EXT,                                                \
            &op_pack_3320##EXT,                                                \
            &op_pack_2330##EXT,                                                \
            &op_unpack_1210##EXT,                                              \
            &op_unpack_3320##EXT,                                              \
            &op_unpack_2330##EXT,                                              \
            &op_swizzle_3012##EXT,                                             \
            &op_swizzle_3021##EXT,                                             \
            &op_swizzle_2103##EXT,                                             \
            &op_swizzle_3210##EXT,                                             \
            &op_swizzle_3102##EXT,                                             \
            &op_swizzle_3201##EXT,                                             \
            &op_swizzle_1203##EXT,                                             \
            &op_swizzle_1023##EXT,                                             \
            &op_swizzle_2013##EXT,                                             \
            &op_swizzle_2310##EXT,                                             \
            &op_swizzle_2130##EXT,                                             \
            &op_swizzle_1230##EXT,                                             \
            &op_swizzle_1320##EXT,                                             \
            &op_swizzle_0213##EXT,                                             \
            &op_swizzle_0231##EXT,                                             \
            &op_swizzle_0312##EXT,                                             \
            &op_swizzle_3120##EXT,                                             \
            &op_swizzle_0321##EXT,                                             \
            &op_swizzle_0003##EXT,                                             \
            &op_swizzle_0001##EXT,                                             \
            &op_swizzle_3000##EXT,                                             \
            &op_swizzle_1000##EXT,                                             \
            &op_clear_alpha0##EXT,                                             \
            &op_clear_alpha1##EXT,                                             \
            &op_clear_alpha3##EXT,                                             \
            &op_clear_zero0##EXT,                                              \
            &op_clear_zero1##EXT,                                              \
            &op_clear_zero3##EXT,                                              \
            REF_PATTERN(clear##EXT, 1, 1, 1, 0),                               \
            REF_PATTERN(clear##EXT, 0, 1, 1, 1),                               \
            REF_PATTERN(clear##EXT, 0, 0, 1, 1),                               \
            REF_PATTERN(clear##EXT, 1, 0, 0, 1),                               \
            REF_PATTERN(clear##EXT, 1, 1, 0, 0),                               \
            REF_PATTERN(clear##EXT, 0, 1, 0, 1),                               \
            REF_PATTERN(clear##EXT, 1, 0, 1, 0),                               \
            REF_PATTERN(clear##EXT, 1, 0, 0, 0),                               \
            REF_PATTERN(clear##EXT, 0, 1, 0, 0),                               \
            REF_PATTERN(clear##EXT, 0, 0, 1, 0),                               \
            NULL                                                               \
        },                                                                     \
    };
385 | | |
/* Declare all 16-bit entries (packed I/O, pack/unpack, byte swap, shifts,
 * 8<->16 conversions) plus the op table ops16##EXT. */
#define DECL_FUNCS_16(SIZE, EXT, FLAG)                                         \
    DECL_PACKED_RW(EXT, 16)                                                    \
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                     \
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                     \
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                     \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1)                                      \
    DECL_SHIFT16(EXT)                                                          \
    DECL_CONVERT(EXT, U8, U16)                                                 \
    DECL_CONVERT(EXT, U16, U8)                                                 \
    DECL_EXPAND(EXT, U8, U16)                                                  \
                                                                               \
    static const SwsOpTable ops16##EXT = {                                     \
        .cpu_flags  = AV_CPU_FLAG_##FLAG,                                      \
        .block_size = SIZE,                                                    \
        .entries = {                                                           \
            &op_read16_packed2##EXT,                                           \
            &op_read16_packed3##EXT,                                           \
            &op_read16_packed4##EXT,                                           \
            &op_write16_packed2##EXT,                                          \
            &op_write16_packed3##EXT,                                          \
            &op_write16_packed4##EXT,                                          \
            &op_pack_4440##EXT,                                                \
            &op_pack_5550##EXT,                                                \
            &op_pack_5650##EXT,                                                \
            &op_unpack_4440##EXT,                                              \
            &op_unpack_5550##EXT,                                              \
            &op_unpack_5650##EXT,                                              \
            REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                          \
            REF_COMMON_PATTERNS(convert_U8_U16##EXT),                          \
            REF_COMMON_PATTERNS(convert_U16_U8##EXT),                          \
            REF_COMMON_PATTERNS(expand_U8_U16##EXT),                           \
            REF_COMMON_PATTERNS(lshift16##EXT),                                \
            REF_COMMON_PATTERNS(rshift16##EXT),                                \
            NULL                                                               \
        },                                                                     \
    };
425 | | |
/* Declare all 32-bit and float entries (packed I/O, 10-bit pack/unpack, byte
 * swap, conversions, min/max, scale, dither, linear) plus the op table
 * ops32##EXT. The 32-bit I/O and swap entries use the _m2 double-width
 * kernels, hence the _m2##EXT suffix on those names. */
#define DECL_FUNCS_32(SIZE, EXT, FLAG)                                         \
    DECL_PACKED_RW(_m2##EXT, 32)                                               \
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2)                             \
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10)                             \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1)                                 \
    DECL_CONVERT(EXT, U8, U32)                                                 \
    DECL_CONVERT(EXT, U32, U8)                                                 \
    DECL_CONVERT(EXT, U16, U32)                                                \
    DECL_CONVERT(EXT, U32, U16)                                                \
    DECL_CONVERT(EXT, U8, F32)                                                 \
    DECL_CONVERT(EXT, F32, U8)                                                 \
    DECL_CONVERT(EXT, U16, F32)                                                \
    DECL_CONVERT(EXT, F32, U16)                                                \
    DECL_EXPAND(EXT, U8, U32)                                                  \
    DECL_MIN_MAX(EXT)                                                          \
    DECL_SCALE(EXT)                                                            \
    DECL_DITHER(EXT, 0)                                                        \
    DECL_DITHER(EXT, 1)                                                        \
    DECL_DITHER(EXT, 2)                                                        \
    DECL_DITHER(EXT, 3)                                                        \
    DECL_DITHER(EXT, 4)                                                        \
    DECL_DITHER(EXT, 5)                                                        \
    DECL_DITHER(EXT, 6)                                                        \
    DECL_DITHER(EXT, 7)                                                        \
    DECL_DITHER(EXT, 8)                                                        \
    DECL_LINEAR(EXT, luma,     SWS_MASK_LUMA)                                  \
    DECL_LINEAR(EXT, alpha,    SWS_MASK_ALPHA)                                 \
    DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA)                 \
    DECL_LINEAR(EXT, dot3,     0x7)                                            \
    DECL_LINEAR(EXT, row0,     SWS_MASK_ROW(0))                                \
    DECL_LINEAR(EXT, row0a,    SWS_MASK_ROW(0) | SWS_MASK_ALPHA)               \
    DECL_LINEAR(EXT, diag3,    SWS_MASK_DIAG3)                                 \
    DECL_LINEAR(EXT, diag4,    SWS_MASK_DIAG4)                                 \
    DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3)                 \
    DECL_LINEAR(EXT, matrix3,  SWS_MASK_MAT3)                                  \
    DECL_LINEAR(EXT, affine3,  SWS_MASK_MAT3 | SWS_MASK_OFF3)                  \
    DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, matrix4,  SWS_MASK_MAT4)                                  \
    DECL_LINEAR(EXT, affine4,  SWS_MASK_MAT4 | SWS_MASK_OFF4)                  \
                                                                               \
    static const SwsOpTable ops32##EXT = {                                     \
        .cpu_flags  = AV_CPU_FLAG_##FLAG,                                      \
        .block_size = SIZE,                                                    \
        .entries = {                                                           \
            &op_read32_packed2_m2##EXT,                                        \
            &op_read32_packed3_m2##EXT,                                        \
            &op_read32_packed4_m2##EXT,                                        \
            &op_write32_packed2_m2##EXT,                                       \
            &op_write32_packed3_m2##EXT,                                       \
            &op_write32_packed4_m2##EXT,                                       \
            &op_pack_1010102_m2##EXT,                                          \
            &op_pack_2101010_m2##EXT,                                          \
            &op_unpack_1010102_m2##EXT,                                        \
            &op_unpack_2101010_m2##EXT,                                        \
            REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT),                       \
            REF_COMMON_PATTERNS(convert_U8_U32##EXT),                          \
            REF_COMMON_PATTERNS(convert_U32_U8##EXT),                          \
            REF_COMMON_PATTERNS(convert_U16_U32##EXT),                         \
            REF_COMMON_PATTERNS(convert_U32_U16##EXT),                         \
            REF_COMMON_PATTERNS(convert_U8_F32##EXT),                          \
            REF_COMMON_PATTERNS(convert_F32_U8##EXT),                          \
            REF_COMMON_PATTERNS(convert_U16_F32##EXT),                         \
            REF_COMMON_PATTERNS(convert_F32_U16##EXT),                         \
            REF_COMMON_PATTERNS(expand_U8_U32##EXT),                           \
            REF_COMMON_PATTERNS(min##EXT),                                     \
            REF_COMMON_PATTERNS(max##EXT),                                     \
            REF_COMMON_PATTERNS(scale##EXT),                                   \
            REF_COMMON_PATTERNS(dither0##EXT),                                 \
            REF_COMMON_PATTERNS(dither1##EXT),                                 \
            REF_COMMON_PATTERNS(dither2##EXT),                                 \
            REF_COMMON_PATTERNS(dither3##EXT),                                 \
            REF_COMMON_PATTERNS(dither4##EXT),                                 \
            REF_COMMON_PATTERNS(dither5##EXT),                                 \
            REF_COMMON_PATTERNS(dither6##EXT),                                 \
            REF_COMMON_PATTERNS(dither7##EXT),                                 \
            REF_COMMON_PATTERNS(dither8##EXT),                                 \
            &op_luma##EXT,                                                     \
            &op_alpha##EXT,                                                    \
            &op_lumalpha##EXT,                                                 \
            &op_dot3##EXT,                                                     \
            &op_row0##EXT,                                                     \
            &op_row0a##EXT,                                                    \
            &op_diag3##EXT,                                                    \
            &op_diag4##EXT,                                                    \
            &op_diagoff3##EXT,                                                 \
            &op_matrix3##EXT,                                                  \
            &op_affine3##EXT,                                                  \
            &op_affine3a##EXT,                                                 \
            &op_matrix4##EXT,                                                  \
            &op_affine4##EXT,                                                  \
            NULL                                                               \
        },                                                                     \
    };
522 | | |
/* Instantiate the op tables. The first argument is the block size (pixels per
 * call); the _m1/_m2 suffixes distinguish the two block-size variants of each
 * instruction set. */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
DECL_FUNCS_8(32, _m1_avx2, AVX2)
DECL_FUNCS_8(32, _m2_sse4, SSE4)
DECL_FUNCS_8(64, _m2_avx2, AVX2)

DECL_FUNCS_16(16, _m1_avx2, AVX2)
DECL_FUNCS_16(32, _m2_avx2, AVX2)

DECL_FUNCS_32(16, _avx2, AVX2)
532 | | |
533 | | static av_const int get_mmsize(const int cpu_flags) |
534 | 0 | { |
535 | 0 | if (cpu_flags & AV_CPU_FLAG_AVX512) |
536 | 0 | return 64; |
537 | 0 | else if (cpu_flags & AV_CPU_FLAG_AVX2) |
538 | 0 | return 32; |
539 | 0 | else if (cpu_flags & AV_CPU_FLAG_SSE4) |
540 | 0 | return 16; |
541 | 0 | else |
542 | 0 | return AVERROR(ENOTSUP); |
543 | 0 | } |
544 | | |
545 | | /** |
546 | | * Returns true if the operation's implementation only depends on the block |
547 | | * size, and not the underlying pixel type |
548 | | */ |
549 | | static bool op_is_type_invariant(const SwsOp *op) |
550 | 0 | { |
551 | 0 | switch (op->op) { |
552 | 0 | case SWS_OP_READ: |
553 | 0 | case SWS_OP_WRITE: |
554 | 0 | return !op->rw.packed && !op->rw.frac; |
555 | 0 | case SWS_OP_SWIZZLE: |
556 | 0 | case SWS_OP_CLEAR: |
557 | 0 | return true; |
558 | 0 | } |
559 | | |
560 | 0 | return false; |
561 | 0 | } |
562 | | |
/**
 * Attempt to compile the entire op list into a single packed byte shuffle.
 * Propagates the error from ff_sws_solve_shuffle() (including ENOTSUP when
 * the op list is not expressible as a shuffle), and ENOMEM on allocation
 * failure; returns 0 on success with *out fully populated.
 */
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
{
    uint8_t shuffle[16];
    int read_bytes, write_bytes;
    int pixels;

    /* Solve the shuffle mask for one 128-bit lane only */
    pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
    if (pixels < 0)
        return pixels;

    /* We can't shuffle across lanes, so restrict the vector size to XMM
     * whenever the read/write size would be a subset of the full vector */
    if (read_bytes < 16 || write_bytes < 16)
        mmsize = 16;

    const int num_lanes = mmsize / 16;
    const int in_total  = num_lanes * read_bytes;
    const int out_total = num_lanes * write_bytes;
    /* Pick the narrowest load that covers the input bytes */
    const int read_size = in_total <= 4 ? 4 : /* movd */
                          in_total <= 8 ? 8 : /* movq */
                          mmsize;             /* movu */

    *out = (SwsCompiledOp) {
        .priv       = av_memdup(shuffle, sizeof(shuffle)),
        .free       = av_free,
        .block_size = pixels * num_lanes,
        .over_read  = read_size - in_total,
        .over_write = mmsize - out_total,
        .cpu_flags  = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                      mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                    AV_CPU_FLAG_SSE4,
    };

    if (!out->priv)
        return AVERROR(ENOMEM);

/* Bind out->func to the asm kernel matching the exact in/out byte counts */
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)                                      \
    do {                                                                       \
        SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT);                  \
        if (in_total == IN && out_total == OUT)                                \
            out->func = ff_packed_shuffle##IN##_##OUT##_##EXT;                 \
    } while (0)

    ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16,  8, sse4);
    ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
    ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
    ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
    ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
    av_assert1(out->func);
    return 0;
}
626 | | |
/* Normalize clear values into 32-bit integer constants, so that setup_clear()
 * can copy them out verbatim regardless of the pixel type. */
static void normalize_clear(SwsOp *op)
{
    static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
    SwsOpPriv priv;
    union {
        uint32_t u32;
        int i;
    } c;

    ff_sws_setup_q4(op, &priv);
    for (int i = 0; i < 4; i++) {
        /* A zero denominator means this component has no clear value */
        if (!op->c.q4[i].den)
            continue;
        switch (ff_sws_pixel_type_size(op->type)) {
        /* Broadcast the component value across the full 32-bit word */
        case 1: c.u32 = 0x1010101U * priv.u8[i]; break;
        case 2: c.u32 = (uint32_t)priv.u16[i] << 16 | priv.u16[i]; break;
        case 4: c.u32 = priv.u32[i]; break;
        }

        /* Store back as an exact integer rational (value / 1) */
        op->c.q4[i].num = c.i;
        op->c.q4[i].den = 1;
    }
}
651 | | |
/**
 * Compile an op list into an executable SwsCompiledOp, either via the fast
 * in-place shuffle path or by chaining together per-op asm kernels from the
 * declared op tables. Returns 0 on success or a negative AVERROR code.
 */
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
    const int cpu_flags = av_get_cpu_flags();
    const int mmsize = get_mmsize(cpu_flags);
    if (mmsize < 0)
        return mmsize;

    av_assert1(ops->num_ops > 0);
    /* First op is the read, last op is the write */
    const SwsOp read = ops->ops[0];
    const SwsOp write = ops->ops[ops->num_ops - 1];
    int ret;

    /* Special fast path for in-place packed shuffle */
    ret = solve_shuffle(ops, mmsize, out);
    if (ret != AVERROR(ENOTSUP))
        return ret;

    SwsOpChain *chain = ff_sws_op_chain_alloc();
    if (!chain)
        return AVERROR(ENOMEM);

    *out = (SwsCompiledOp) {
        .priv = chain,
        .free = ff_sws_op_chain_free_cb,

        /* Use at most two full YMM regs during the widest precision section */
        .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
    };

    /* 3-component reads/writes process one extra garbage word */
    if (read.rw.packed && read.rw.elems == 3)
        out->over_read = sizeof(uint32_t);
    if (write.rw.packed && write.rw.elems == 3)
        out->over_write = sizeof(uint32_t);

    static const SwsOpTable *const tables[] = {
        &ops8_m1_sse4,
        &ops8_m1_avx2,
        &ops8_m2_sse4,
        &ops8_m2_avx2,
        &ops16_m1_avx2,
        &ops16_m2_avx2,
        &ops32_avx2,
    };

    /* Retry compilation while it reports EAGAIN */
    do {
        int op_block_size = out->block_size;
        SwsOp *op = &ops->ops[0];

        /* Type-invariant ops can be compiled as U8 with a scaled block size */
        if (op_is_type_invariant(op)) {
            if (op->op == SWS_OP_CLEAR)
                normalize_clear(op);
            op_block_size *= ff_sws_pixel_type_size(op->type);
            op->type = SWS_PIXEL_U8;
        }

        ret = ff_sws_op_compile_tables(tables, FF_ARRAY_ELEMS(tables), ops,
                                       op_block_size, chain);
    } while (ret == AVERROR(EAGAIN));
    if (ret < 0) {
        ff_sws_op_chain_free(chain);
        return ret;
    }

/* Append NAME's return trampoline to the chain and set NAME as entry point */
#define ASSIGN_PROCESS_FUNC(NAME)                                              \
    do {                                                                       \
        SWS_DECL_FUNC(NAME);                                                   \
        void NAME##_return(void);                                              \
        ret = ff_sws_op_chain_append(chain, NAME##_return,                     \
                                     NULL, &(SwsOpPriv) {0});                  \
        out->func = NAME;                                                      \
    } while (0)

    /* Pick the process wrapper handling the maximum plane count in use;
     * packed I/O touches a single plane */
    const int read_planes = read.rw.packed ? 1 : read.rw.elems;
    const int write_planes = write.rw.packed ? 1 : write.rw.elems;
    switch (FFMAX(read_planes, write_planes)) {
    case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
    case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
    case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
    case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
    }

    if (ret < 0) {
        ff_sws_op_chain_free(chain);
        return ret;
    }

    out->cpu_flags = chain->cpu_flags;
    return 0;
}
742 | | |
/* x86 backend entry point, registered with the generic ops framework */
const SwsOpBackend backend_x86 = {
    .name    = "x86",
    .compile = compile,
};