/src/ffmpeg/libswscale/ops.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "libavutil/avassert.h" |
22 | | #include "libavutil/bswap.h" |
23 | | #include "libavutil/mem.h" |
24 | | #include "libavutil/rational.h" |
25 | | #include "libavutil/refstruct.h" |
26 | | |
27 | | #include "ops.h" |
28 | | #include "ops_internal.h" |
29 | | |
extern const SwsOpBackend backend_c;
extern const SwsOpBackend backend_murder;
extern const SwsOpBackend backend_x86;

/**
 * Registry of operation-compiler backends, in priority order; earlier entries
 * are tried first by ff_sws_ops_compile(). NULL-terminated.
 */
const SwsOpBackend * const ff_sws_op_backends[] = {
    &backend_murder,
#if ARCH_X86_64 && HAVE_X86ASM
    &backend_x86,
#endif
    &backend_c,
    NULL
};
42 | | |
/* Evaluate `x` and propagate any error (negative return value) out of the
 * enclosing function; requires a local `int ret` to be in scope. */
#define RET(x)                                                                 \
    do {                                                                       \
        if ((ret = (x)) < 0)                                                   \
            return ret;                                                        \
    } while (0)
48 | | |
49 | | const char *ff_sws_pixel_type_name(SwsPixelType type) |
50 | 0 | { |
51 | 0 | switch (type) { |
52 | 0 | case SWS_PIXEL_U8: return "u8"; |
53 | 0 | case SWS_PIXEL_U16: return "u16"; |
54 | 0 | case SWS_PIXEL_U32: return "u32"; |
55 | 0 | case SWS_PIXEL_F32: return "f32"; |
56 | 0 | case SWS_PIXEL_NONE: return "none"; |
57 | 0 | case SWS_PIXEL_TYPE_NB: break; |
58 | 0 | } |
59 | | |
60 | 0 | av_unreachable("Invalid pixel type!"); |
61 | 0 | return "ERR"; |
62 | 0 | } |
63 | | |
64 | | int ff_sws_pixel_type_size(SwsPixelType type) |
65 | 0 | { |
66 | 0 | switch (type) { |
67 | 0 | case SWS_PIXEL_U8: return sizeof(uint8_t); |
68 | 0 | case SWS_PIXEL_U16: return sizeof(uint16_t); |
69 | 0 | case SWS_PIXEL_U32: return sizeof(uint32_t); |
70 | 0 | case SWS_PIXEL_F32: return sizeof(float); |
71 | 0 | case SWS_PIXEL_NONE: break; |
72 | 0 | case SWS_PIXEL_TYPE_NB: break; |
73 | 0 | } |
74 | | |
75 | 0 | av_unreachable("Invalid pixel type!"); |
76 | 0 | return 0; |
77 | 0 | } |
78 | | |
79 | | bool ff_sws_pixel_type_is_int(SwsPixelType type) |
80 | 0 | { |
81 | 0 | switch (type) { |
82 | 0 | case SWS_PIXEL_U8: |
83 | 0 | case SWS_PIXEL_U16: |
84 | 0 | case SWS_PIXEL_U32: |
85 | 0 | return true; |
86 | 0 | case SWS_PIXEL_F32: |
87 | 0 | return false; |
88 | 0 | case SWS_PIXEL_NONE: |
89 | 0 | case SWS_PIXEL_TYPE_NB: break; |
90 | 0 | } |
91 | | |
92 | 0 | av_unreachable("Invalid pixel type!"); |
93 | 0 | return false; |
94 | 0 | } |
95 | | |
96 | | /* biased towards `a` */ |
97 | | static AVRational av_min_q(AVRational a, AVRational b) |
98 | 0 | { |
99 | 0 | return av_cmp_q(a, b) == 1 ? b : a; |
100 | 0 | } |
101 | | |
102 | | static AVRational av_max_q(AVRational a, AVRational b) |
103 | 0 | { |
104 | 0 | return av_cmp_q(a, b) == -1 ? b : a; |
105 | 0 | } |
106 | | |
/**
 * Symbolically apply the effect of `op` to a vector of four exact rational
 * component values, in place. This is used to propagate known component
 * values / value ranges through an op list (see ff_sws_op_list_update_comps()).
 *
 * A component with .den == 0 acts as an "unknown" sentinel and is passed
 * through unchanged by the arithmetic ops.
 */
void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
{
    uint64_t mask[4];
    int shift[4];

    switch (op->op) {
    case SWS_OP_READ:
    case SWS_OP_WRITE:
        /* Pure data movement; values are unaffected */
        return;
    case SWS_OP_UNPACK: {
        av_assert1(ff_sws_pixel_type_is_int(op->type));
        ff_sws_pack_op_decode(op, mask, shift);
        /* Split the packed integer in x[0] into its bit fields */
        unsigned val = x[0].num;
        for (int i = 0; i < 4; i++)
            x[i] = Q((val >> shift[i]) & mask[i]);
        return;
    }
    case SWS_OP_PACK: {
        av_assert1(ff_sws_pixel_type_is_int(op->type));
        ff_sws_pack_op_decode(op, mask, shift);
        /* Merge the component bit fields back into a single integer */
        unsigned val = 0;
        for (int i = 0; i < 4; i++)
            val |= (x[i].num & mask[i]) << shift[i];
        x[0] = Q(val);
        return;
    }
    case SWS_OP_SWAP_BYTES:
        av_assert1(ff_sws_pixel_type_is_int(op->type));
        switch (ff_sws_pixel_type_size(op->type)) {
        case 2:
            for (int i = 0; i < 4; i++)
                x[i].num = av_bswap16(x[i].num);
            break;
        case 4:
            for (int i = 0; i < 4; i++)
                x[i].num = av_bswap32(x[i].num);
            break;
        }
        return;
    case SWS_OP_CLEAR:
        /* Components with a set (den != 0) clear value are overwritten */
        for (int i = 0; i < 4; i++) {
            if (op->c.q4[i].den)
                x[i] = op->c.q4[i];
        }
        return;
    case SWS_OP_LSHIFT: {
        av_assert1(ff_sws_pixel_type_is_int(op->type));
        AVRational mult = Q(1 << op->c.u);
        for (int i = 0; i < 4; i++)
            x[i] = x[i].den ? av_mul_q(x[i], mult) : x[i];
        return;
    }
    case SWS_OP_RSHIFT: {
        av_assert1(ff_sws_pixel_type_is_int(op->type));
        /* Truncate to integer first, mirroring the integer shift semantics */
        for (int i = 0; i < 4; i++)
            x[i] = x[i].den ? Q((x[i].num / x[i].den) >> op->c.u) : x[i];
        return;
    }
    case SWS_OP_SWIZZLE: {
        const AVRational orig[4] = { x[0], x[1], x[2], x[3] };
        for (int i = 0; i < 4; i++)
            x[i] = orig[op->swizzle.in[i]];
        return;
    }
    case SWS_OP_CONVERT:
        if (ff_sws_pixel_type_is_int(op->convert.to)) {
            const AVRational scale = ff_sws_pixel_expand(op->type, op->convert.to);
            for (int i = 0; i < 4; i++) {
                /* float -> int conversion truncates */
                x[i] = x[i].den ? Q(x[i].num / x[i].den) : x[i];
                if (op->convert.expand)
                    x[i] = av_mul_q(x[i], scale);
            }
        }
        return;
    case SWS_OP_DITHER:
        av_assert1(!ff_sws_pixel_type_is_int(op->type));
        /* Model dithering as adding the average offset of 1/2 */
        for (int i = 0; i < 4; i++)
            x[i] = x[i].den ? av_add_q(x[i], av_make_q(1, 2)) : x[i];
        return;
    case SWS_OP_MIN:
        for (int i = 0; i < 4; i++)
            x[i] = av_min_q(x[i], op->c.q4[i]);
        return;
    case SWS_OP_MAX:
        for (int i = 0; i < 4; i++)
            x[i] = av_max_q(x[i], op->c.q4[i]);
        return;
    case SWS_OP_LINEAR: {
        av_assert1(!ff_sws_pixel_type_is_int(op->type));
        /* x = M * [orig, 1]; column 4 of the matrix holds the offset */
        const AVRational orig[4] = { x[0], x[1], x[2], x[3] };
        for (int i = 0; i < 4; i++) {
            AVRational sum = op->lin.m[i][4];
            for (int j = 0; j < 4; j++)
                sum = av_add_q(sum, av_mul_q(orig[j], op->lin.m[i][j]));
            x[i] = sum;
        }
        return;
    }
    case SWS_OP_SCALE:
        for (int i = 0; i < 4; i++)
            x[i] = x[i].den ? av_mul_q(x[i], op->c.q) : x[i];
        return;
    }

    av_unreachable("Invalid operation type!");
}
213 | | |
214 | | /* merge_comp_flags() forms a monoid with flags_identity as the null element */ |
215 | | static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT; |
216 | | static unsigned merge_comp_flags(unsigned a, unsigned b) |
217 | 0 | { |
218 | 0 | const unsigned flags_or = SWS_COMP_GARBAGE; |
219 | 0 | const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT; |
220 | 0 | return ((a & b) & flags_and) | ((a | b) & flags_or); |
221 | 0 | } |
222 | | |
/* Infer + propagate known information about components.
 *
 * Two dataflow passes over the op list:
 *  - a forwards pass that propagates per-component value knowledge
 *    (flags and min/max bounds) from producers to consumers, and
 *  - a backwards pass that propagates "unused" information from
 *    consumers back to producers (dead component elimination info).
 *
 * The results are stored in each op's `comps` field. */
void ff_sws_op_list_update_comps(SwsOpList *ops)
{
    SwsComps next = { .unused = {true, true, true, true} };
    SwsComps prev = { .flags = {
        SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE,
    }};

    /* Forwards pass, propagates knowledge about the incoming pixel values */
    for (int n = 0; n < ops->num_ops; n++) {
        SwsOp *op = &ops->ops[n];

        switch (op->op) {
        case SWS_OP_READ:
        case SWS_OP_LINEAR:
        case SWS_OP_SWAP_BYTES:
        case SWS_OP_UNPACK:
            break; /* special cases, handled below */
        default:
            /* Generic case: the op's effect on the bounds is obtained by
             * symbolically evaluating it on the incoming min/max vectors */
            memcpy(op->comps.min, prev.min, sizeof(prev.min));
            memcpy(op->comps.max, prev.max, sizeof(prev.max));
            ff_sws_apply_op_q(op, op->comps.min);
            ff_sws_apply_op_q(op, op->comps.max);
            break;
        }

        switch (op->op) {
        case SWS_OP_READ:
            /* Active components are preserved from the user-provided value,
             * other components are explicitly stripped */
            for (int i = op->rw.elems; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i];
                op->comps.min[i] = prev.min[i];
                op->comps.max[i] = prev.max[i];
            }
            break;
        case SWS_OP_SWAP_BYTES:
            /* Byte swapping toggles the SWAPPED flag; bounds pass through
             * (the symbolic bounds of swapped values are not tracked) */
            for (int i = 0; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i] ^ SWS_COMP_SWAPPED;
                op->comps.min[i] = prev.min[i];
                op->comps.max[i] = prev.max[i];
            }
            break;
        case SWS_OP_WRITE:
            /* Writing garbage out would be a logic error upstream */
            for (int i = 0; i < op->rw.elems; i++)
                av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
            /* fall through */
        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
        case SWS_OP_MIN:
        case SWS_OP_MAX:
            /* Linearly propagate flags per component */
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[i];
            break;
        case SWS_OP_DITHER:
            /* Strip zero flag because of the nonzero dithering offset */
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
            break;
        case SWS_OP_UNPACK:
            for (int i = 0; i < 4; i++) {
                const int pattern = op->pack.pattern[i];
                if (pattern) {
                    av_assert1(pattern < 32);
                    /* Every unpacked field inherits the flags of the packed
                     * source value, and is bounded by its bit width */
                    op->comps.flags[i] = prev.flags[0];
                    op->comps.min[i] = Q(0);
                    op->comps.max[i] = Q((1ULL << pattern) - 1);
                } else
                    op->comps.flags[i] = SWS_COMP_GARBAGE;
            }
            break;
        case SWS_OP_PACK: {
            /* The packed value merges the flags of all participating fields */
            unsigned flags = flags_identity;
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    flags = merge_comp_flags(flags, prev.flags[i]);
                if (i > 0) /* clear remaining comps for sanity */
                    op->comps.flags[i] = SWS_COMP_GARBAGE;
            }
            op->comps.flags[0] = flags;
            break;
        }
        case SWS_OP_CLEAR:
            for (int i = 0; i < 4; i++) {
                if (op->c.q4[i].den) {
                    /* Cleared to a known constant */
                    if (op->c.q4[i].num == 0) {
                        op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT;
                    } else if (op->c.q4[i].den == 1) {
                        op->comps.flags[i] = SWS_COMP_EXACT;
                    }
                } else {
                    /* Component untouched by the clear */
                    op->comps.flags[i] = prev.flags[i];
                }
            }
            break;
        case SWS_OP_SWIZZLE:
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[op->swizzle.in[i]];
            break;
        case SWS_OP_CONVERT:
            for (int i = 0; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i];
                /* Conversion to integer truncates to an exact value */
                if (ff_sws_pixel_type_is_int(op->convert.to))
                    op->comps.flags[i] |= SWS_COMP_EXACT;
            }
            break;
        case SWS_OP_LINEAR:
            for (int i = 0; i < 4; i++) {
                unsigned flags = flags_identity;
                AVRational min = Q(0), max = Q(0);
                /* Interval arithmetic: sum the per-input contributions,
                 * swapping bounds for negative coefficients */
                for (int j = 0; j < 4; j++) {
                    const AVRational k = op->lin.m[i][j];
                    AVRational mink = av_mul_q(prev.min[j], k);
                    AVRational maxk = av_mul_q(prev.max[j], k);
                    if (k.num) {
                        flags = merge_comp_flags(flags, prev.flags[j]);
                        if (k.den != 1) /* fractional coefficient */
                            flags &= ~SWS_COMP_EXACT;
                        if (k.num < 0)
                            FFSWAP(AVRational, mink, maxk);
                        min = av_add_q(min, mink);
                        max = av_add_q(max, maxk);
                    }
                }
                if (op->lin.m[i][4].num) { /* nonzero offset */
                    flags &= ~SWS_COMP_ZERO;
                    if (op->lin.m[i][4].den != 1) /* fractional offset */
                        flags &= ~SWS_COMP_EXACT;
                    min = av_add_q(min, op->lin.m[i][4]);
                    max = av_add_q(max, op->lin.m[i][4]);
                }
                op->comps.flags[i] = flags;
                op->comps.min[i] = min;
                op->comps.max[i] = max;
            }
            break;
        case SWS_OP_SCALE:
            for (int i = 0; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i];
                if (op->c.q.den != 1) /* fractional scale */
                    op->comps.flags[i] &= ~SWS_COMP_EXACT;
                if (op->c.q.num < 0) /* negative scale flips the bounds */
                    FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
            }
            break;

        case SWS_OP_INVALID:
        case SWS_OP_TYPE_NB:
            av_unreachable("Invalid operation type!");
        }

        prev = op->comps;
    }

    /* Backwards pass, solves for component dependencies */
    for (int n = ops->num_ops - 1; n >= 0; n--) {
        SwsOp *op = &ops->ops[n];

        switch (op->op) {
        case SWS_OP_READ:
        case SWS_OP_WRITE:
            /* A read makes the previous value of its active components dead;
             * a write makes its active components live */
            for (int i = 0; i < op->rw.elems; i++)
                op->comps.unused[i] = op->op == SWS_OP_READ;
            for (int i = op->rw.elems; i < 4; i++)
                op->comps.unused[i] = next.unused[i];
            break;
        case SWS_OP_SWAP_BYTES:
        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
        case SWS_OP_CONVERT:
        case SWS_OP_DITHER:
        case SWS_OP_MIN:
        case SWS_OP_MAX:
        case SWS_OP_SCALE:
            /* Per-component ops: liveness passes straight through */
            for (int i = 0; i < 4; i++)
                op->comps.unused[i] = next.unused[i];
            break;
        case SWS_OP_UNPACK: {
            /* The packed input (comp 0) is needed iff any unpacked field
             * is used downstream */
            bool unused = true;
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    unused &= next.unused[i];
                op->comps.unused[i] = i > 0;
            }
            op->comps.unused[0] = unused;
            break;
        }
        case SWS_OP_PACK:
            /* Every packed field is needed iff the packed output is */
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    op->comps.unused[i] = next.unused[0];
                else
                    op->comps.unused[i] = true;
            }
            break;
        case SWS_OP_CLEAR:
            /* Cleared components no longer depend on their previous value */
            for (int i = 0; i < 4; i++) {
                if (op->c.q4[i].den)
                    op->comps.unused[i] = true;
                else
                    op->comps.unused[i] = next.unused[i];
            }
            break;
        case SWS_OP_SWIZZLE: {
            /* A source component is live if any of its destinations is */
            bool unused[4] = { true, true, true, true };
            for (int i = 0; i < 4; i++)
                unused[op->swizzle.in[i]] &= next.unused[i];
            for (int i = 0; i < 4; i++)
                op->comps.unused[i] = unused[i];
            break;
        }
        case SWS_OP_LINEAR:
            /* Input j is live if any live output row has a nonzero
             * coefficient for it */
            for (int j = 0; j < 4; j++) {
                bool unused = true;
                for (int i = 0; i < 4; i++) {
                    if (op->lin.m[i][j].num)
                        unused &= next.unused[i];
                }
                op->comps.unused[j] = unused;
            }
            break;
        }

        next = op->comps;
    }
}
450 | | |
451 | | static void op_uninit(SwsOp *op) |
452 | 0 | { |
453 | 0 | switch (op->op) { |
454 | 0 | case SWS_OP_DITHER: |
455 | 0 | av_refstruct_unref(&op->dither.matrix); |
456 | 0 | break; |
457 | 0 | } |
458 | | |
459 | 0 | *op = (SwsOp) {0}; |
460 | 0 | } |
461 | | |
462 | | SwsOpList *ff_sws_op_list_alloc(void) |
463 | 0 | { |
464 | 0 | SwsOpList *ops = av_mallocz(sizeof(SwsOpList)); |
465 | 0 | if (!ops) |
466 | 0 | return NULL; |
467 | | |
468 | 0 | ff_fmt_clear(&ops->src); |
469 | 0 | ff_fmt_clear(&ops->dst); |
470 | 0 | return ops; |
471 | 0 | } |
472 | | |
473 | | void ff_sws_op_list_free(SwsOpList **p_ops) |
474 | 0 | { |
475 | 0 | SwsOpList *ops = *p_ops; |
476 | 0 | if (!ops) |
477 | 0 | return; |
478 | | |
479 | 0 | for (int i = 0; i < ops->num_ops; i++) |
480 | 0 | op_uninit(&ops->ops[i]); |
481 | |
|
482 | 0 | av_freep(&ops->ops); |
483 | 0 | av_free(ops); |
484 | 0 | *p_ops = NULL; |
485 | 0 | } |
486 | | |
/* Deep-ish duplicate of an op list: the ops array is copied and refcounted
 * resources (dither matrices) get an extra reference. Returns NULL on
 * allocation failure. */
SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops)
{
    SwsOpList *copy = av_malloc(sizeof(*copy));
    if (!copy)
        return NULL;

    /* Round the copied element count up to a power of two, matching the
     * growth pattern of the dynamic array that backs ops->ops */
    int num = ops->num_ops;
    if (num)
        num = 1 << av_ceil_log2(num);

    *copy = *ops;
    /* NOTE(review): for an empty list this calls av_memdup(ptr, 0), which
     * returns NULL and is then treated as an allocation failure — confirm
     * callers never duplicate an empty list */
    copy->ops = av_memdup(ops->ops, num * sizeof(ops->ops[0]));
    if (!copy->ops) {
        av_free(copy);
        return NULL;
    }

    /* Bump the refcount of shared resources held by the copied ops */
    for (int i = 0; i < ops->num_ops; i++) {
        const SwsOp *op = &ops->ops[i];
        switch (op->op) {
        case SWS_OP_DITHER:
            av_refstruct_ref(copy->ops[i].dither.matrix);
            break;
        }
    }

    return copy;
}
515 | | |
516 | | void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count) |
517 | 0 | { |
518 | 0 | const int end = ops->num_ops - count; |
519 | 0 | av_assert2(index >= 0 && count >= 0 && index + count <= ops->num_ops); |
520 | 0 | op_uninit(&ops->ops[index]); |
521 | 0 | for (int i = index; i < end; i++) |
522 | 0 | ops->ops[i] = ops->ops[i + count]; |
523 | 0 | ops->num_ops = end; |
524 | 0 | } |
525 | | |
526 | | int ff_sws_op_list_insert_at(SwsOpList *ops, int index, SwsOp *op) |
527 | 0 | { |
528 | 0 | void *ret = av_dynarray2_add((void **) &ops->ops, &ops->num_ops, sizeof(*op), NULL); |
529 | 0 | if (!ret) { |
530 | 0 | op_uninit(op); |
531 | 0 | return AVERROR(ENOMEM); |
532 | 0 | } |
533 | | |
534 | 0 | for (int i = ops->num_ops - 1; i > index; i--) |
535 | 0 | ops->ops[i] = ops->ops[i - 1]; |
536 | 0 | ops->ops[index] = *op; |
537 | 0 | return 0; |
538 | 0 | } |
539 | | |
/* Append *op at the end of the list; same ownership semantics as
 * ff_sws_op_list_insert_at() (the op is uninitialized on failure). */
int ff_sws_op_list_append(SwsOpList *ops, SwsOp *op)
{
    return ff_sws_op_list_insert_at(ops, ops->num_ops, op);
}
544 | | |
545 | | int ff_sws_op_list_max_size(const SwsOpList *ops) |
546 | 0 | { |
547 | 0 | int max_size = 0; |
548 | 0 | for (int i = 0; i < ops->num_ops; i++) { |
549 | 0 | const int size = ff_sws_pixel_type_size(ops->ops[i].type); |
550 | 0 | max_size = FFMAX(max_size, size); |
551 | 0 | } |
552 | |
|
553 | 0 | return max_size; |
554 | 0 | } |
555 | | |
556 | | uint32_t ff_sws_linear_mask(const SwsLinearOp c) |
557 | 0 | { |
558 | 0 | uint32_t mask = 0; |
559 | 0 | for (int i = 0; i < 4; i++) { |
560 | 0 | for (int j = 0; j < 5; j++) { |
561 | 0 | if (av_cmp_q(c.m[i][j], Q(i == j))) |
562 | 0 | mask |= SWS_MASK(i, j); |
563 | 0 | } |
564 | 0 | } |
565 | 0 | return mask; |
566 | 0 | } |
567 | | |
/* Short human-readable description of a linear-op coefficient mask. */
static const char *describe_lin_mask(uint32_t mask)
{
    /* Try to be fairly descriptive without assuming too much */
    static const struct {
        char name[24];
        uint32_t mask;
    } patterns[] = {
        /* Ordered roughly from most restrictive to most general; the first
         * pattern that covers every set bit of `mask` wins, so the order of
         * this table is load-bearing */
        { "noop",                0 },
        { "luma",                SWS_MASK_LUMA },
        { "alpha",               SWS_MASK_ALPHA },
        { "luma+alpha",          SWS_MASK_LUMA | SWS_MASK_ALPHA },
        { "dot3",                0x7 },
        { "dot4",                0xF },
        { "row0",                SWS_MASK_ROW(0) },
        { "row0+alpha",          SWS_MASK_ROW(0) | SWS_MASK_ALPHA },
        { "col0",                SWS_MASK_COL(0) },
        { "col0+off3",           SWS_MASK_COL(0) | SWS_MASK_OFF3 },
        { "off3",                SWS_MASK_OFF3 },
        { "off3+alpha",          SWS_MASK_OFF3 | SWS_MASK_ALPHA },
        { "diag3",               SWS_MASK_DIAG3 },
        { "diag4",               SWS_MASK_DIAG4 },
        { "diag3+alpha",         SWS_MASK_DIAG3 | SWS_MASK_ALPHA },
        { "diag3+off3",          SWS_MASK_DIAG3 | SWS_MASK_OFF3 },
        { "diag3+off3+alpha",    SWS_MASK_DIAG3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA },
        { "diag4+off4",          SWS_MASK_DIAG4 | SWS_MASK_OFF4 },
        { "matrix3",             SWS_MASK_MAT3 },
        { "matrix3+off3",        SWS_MASK_MAT3 | SWS_MASK_OFF3 },
        { "matrix3+off3+alpha",  SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA },
        { "matrix4",             SWS_MASK_MAT4 },
        { "matrix4+off4",        SWS_MASK_MAT4 | SWS_MASK_OFF4 },
    };

    for (int i = 0; i < FF_ARRAY_ELEMS(patterns); i++) {
        /* Accept the first pattern that is a superset of `mask` */
        if (!(mask & ~patterns[i].mask))
            return patterns[i].name;
    }

    av_unreachable("Invalid linear mask!");
    return "ERR";
}
608 | | |
609 | | static char describe_comp_flags(unsigned flags) |
610 | 0 | { |
611 | 0 | if (flags & SWS_COMP_GARBAGE) |
612 | 0 | return 'X'; |
613 | 0 | else if (flags & SWS_COMP_ZERO) |
614 | 0 | return '0'; |
615 | 0 | else if (flags & SWS_COMP_SWAPPED) |
616 | 0 | return 'z'; |
617 | 0 | else if (flags & SWS_COMP_EXACT) |
618 | 0 | return '+'; |
619 | 0 | else |
620 | 0 | return '.'; |
621 | 0 | } |
622 | | |
623 | | static const char *print_q(const AVRational q, char buf[], int buf_len) |
624 | 0 | { |
625 | 0 | if (!q.den) { |
626 | 0 | return q.num > 0 ? "inf" : q.num < 0 ? "-inf" : "nan"; |
627 | 0 | } else if (q.den == 1) { |
628 | 0 | snprintf(buf, buf_len, "%d", q.num); |
629 | 0 | return buf; |
630 | 0 | } else if (abs(q.num) > 1000 || abs(q.den) > 1000) { |
631 | 0 | snprintf(buf, buf_len, "%f", av_q2d(q)); |
632 | 0 | return buf; |
633 | 0 | } else { |
634 | 0 | snprintf(buf, buf_len, "%d/%d", q.num, q.den); |
635 | 0 | return buf; |
636 | 0 | } |
637 | 0 | } |
638 | | |
639 | 0 | #define PRINTQ(q) print_q(q, (char[32]){0}, sizeof(char[32]) - 1) |
640 | | |
/* Pretty-print an op list at the given log level. Each op is prefixed with
 * its pixel type, the per-component "unused" markers and the per-component
 * flag summary; value ranges are additionally printed at trace level. */
void ff_sws_op_list_print(void *log, int lev, const SwsOpList *ops)
{
    if (!ops->num_ops) {
        av_log(log, lev, "  (empty)\n");
        return;
    }

    for (int i = 0; i < ops->num_ops; i++) {
        const SwsOp *op = &ops->ops[i];
        av_log(log, lev, "  [%3s %c%c%c%c -> %c%c%c%c] ",
               ff_sws_pixel_type_name(op->type),
               op->comps.unused[0] ? 'X' : '.',
               op->comps.unused[1] ? 'X' : '.',
               op->comps.unused[2] ? 'X' : '.',
               op->comps.unused[3] ? 'X' : '.',
               describe_comp_flags(op->comps.flags[0]),
               describe_comp_flags(op->comps.flags[1]),
               describe_comp_flags(op->comps.flags[2]),
               describe_comp_flags(op->comps.flags[3]));

        switch (op->op) {
        case SWS_OP_INVALID:
            av_log(log, lev, "SWS_OP_INVALID\n");
            break;
        case SWS_OP_READ:
        case SWS_OP_WRITE:
            av_log(log, lev, "%-20s: %d elem(s) %s >> %d\n",
                   op->op == SWS_OP_READ ? "SWS_OP_READ"
                                         : "SWS_OP_WRITE",
                   op->rw.elems, op->rw.packed ? "packed" : "planar",
                   op->rw.frac);
            break;
        case SWS_OP_SWAP_BYTES:
            av_log(log, lev, "SWS_OP_SWAP_BYTES\n");
            break;
        case SWS_OP_LSHIFT:
            av_log(log, lev, "%-20s: << %u\n", "SWS_OP_LSHIFT", op->c.u);
            break;
        case SWS_OP_RSHIFT:
            av_log(log, lev, "%-20s: >> %u\n", "SWS_OP_RSHIFT", op->c.u);
            break;
        case SWS_OP_PACK:
        case SWS_OP_UNPACK:
            av_log(log, lev, "%-20s: {%d %d %d %d}\n",
                   op->op == SWS_OP_PACK ? "SWS_OP_PACK"
                                         : "SWS_OP_UNPACK",
                   op->pack.pattern[0], op->pack.pattern[1],
                   op->pack.pattern[2], op->pack.pattern[3]);
            break;
        case SWS_OP_CLEAR:
            /* "_" marks components not touched by the clear (den == 0) */
            av_log(log, lev, "%-20s: {%s %s %s %s}\n", "SWS_OP_CLEAR",
                   op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
                   op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
                   op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
                   op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
            break;
        case SWS_OP_SWIZZLE:
            av_log(log, lev, "%-20s: %d%d%d%d\n", "SWS_OP_SWIZZLE",
                   op->swizzle.x, op->swizzle.y, op->swizzle.z, op->swizzle.w);
            break;
        case SWS_OP_CONVERT:
            av_log(log, lev, "%-20s: %s -> %s%s\n", "SWS_OP_CONVERT",
                   ff_sws_pixel_type_name(op->type),
                   ff_sws_pixel_type_name(op->convert.to),
                   op->convert.expand ? " (expand)" : "");
            break;
        case SWS_OP_DITHER:
            av_log(log, lev, "%-20s: %dx%d matrix + {%d %d %d %d}\n", "SWS_OP_DITHER",
                   1 << op->dither.size_log2, 1 << op->dither.size_log2,
                   op->dither.y_offset[0], op->dither.y_offset[1],
                   op->dither.y_offset[2], op->dither.y_offset[3]);
            break;
        case SWS_OP_MIN:
            av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_MIN",
                   op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
                   op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
                   op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
                   op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
            break;
        case SWS_OP_MAX:
            av_log(log, lev, "%-20s: {%s %s %s %s} <= x\n", "SWS_OP_MAX",
                   op->c.q4[0].den ? PRINTQ(op->c.q4[0]) : "_",
                   op->c.q4[1].den ? PRINTQ(op->c.q4[1]) : "_",
                   op->c.q4[2].den ? PRINTQ(op->c.q4[2]) : "_",
                   op->c.q4[3].den ? PRINTQ(op->c.q4[3]) : "_");
            break;
        case SWS_OP_LINEAR:
            av_log(log, lev, "%-20s: %s [[%s %s %s %s %s] "
                   "[%s %s %s %s %s] "
                   "[%s %s %s %s %s] "
                   "[%s %s %s %s %s]]\n",
                   "SWS_OP_LINEAR", describe_lin_mask(op->lin.mask),
                   PRINTQ(op->lin.m[0][0]), PRINTQ(op->lin.m[0][1]), PRINTQ(op->lin.m[0][2]), PRINTQ(op->lin.m[0][3]), PRINTQ(op->lin.m[0][4]),
                   PRINTQ(op->lin.m[1][0]), PRINTQ(op->lin.m[1][1]), PRINTQ(op->lin.m[1][2]), PRINTQ(op->lin.m[1][3]), PRINTQ(op->lin.m[1][4]),
                   PRINTQ(op->lin.m[2][0]), PRINTQ(op->lin.m[2][1]), PRINTQ(op->lin.m[2][2]), PRINTQ(op->lin.m[2][3]), PRINTQ(op->lin.m[2][4]),
                   PRINTQ(op->lin.m[3][0]), PRINTQ(op->lin.m[3][1]), PRINTQ(op->lin.m[3][2]), PRINTQ(op->lin.m[3][3]), PRINTQ(op->lin.m[3][4]));
            break;
        case SWS_OP_SCALE:
            av_log(log, lev, "%-20s: * %s\n", "SWS_OP_SCALE",
                   PRINTQ(op->c.q));
            break;
        case SWS_OP_TYPE_NB:
            break;
        }

        /* Dump tracked value ranges (trace level only), if any are set */
        if (op->comps.min[0].den || op->comps.min[1].den ||
            op->comps.min[2].den || op->comps.min[3].den ||
            op->comps.max[0].den || op->comps.max[1].den ||
            op->comps.max[2].den || op->comps.max[3].den)
        {
            av_log(log, AV_LOG_TRACE, "    min: {%s, %s, %s, %s}, max: {%s, %s, %s, %s}\n",
                   PRINTQ(op->comps.min[0]), PRINTQ(op->comps.min[1]),
                   PRINTQ(op->comps.min[2]), PRINTQ(op->comps.min[3]),
                   PRINTQ(op->comps.max[0]), PRINTQ(op->comps.max[1]),
                   PRINTQ(op->comps.max[2]), PRINTQ(op->comps.max[3]));
        }

    }

    av_log(log, lev, "    (X = unused, z = byteswapped, + = exact, 0 = zero)\n");
}
762 | | |
/* Compile `ops` with a specific backend. On success, the compiled kernel is
 * returned in *out (ownership transferred to the caller). On failure, a
 * negative AVERROR is returned; AVERROR(ENOTSUP) means the backend simply
 * cannot handle these ops and is logged only at trace level. */
int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
                               const SwsOpList *ops, SwsCompiledOp *out)
{
    SwsOpList *copy, rest;
    SwsCompiledOp compiled = {0};
    int ret = 0;

    /* Work on a private copy, since the backend may consume/modify the list */
    copy = ff_sws_op_list_duplicate(ops);
    if (!copy)
        return AVERROR(ENOMEM);

    /* Ensure these are always set during compilation */
    ff_sws_op_list_update_comps(copy);

    /* Make an on-stack copy of `ops` to ensure we can still properly clean up
     * the copy afterwards */
    rest = *copy;

    ret = backend->compile(ctx, &rest, &compiled);
    if (ret < 0) {
        int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
        av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
               backend->name, av_err2str(ret));
        /* If the backend made partial progress, show what was left over */
        if (rest.num_ops != ops->num_ops) {
            av_log(ctx, msg_lev, "Uncompiled remainder:\n");
            ff_sws_op_list_print(ctx, msg_lev, &rest);
        }
    } else {
        *out = compiled;
    }

    ff_sws_op_list_free(&copy);
    return ret;
}
797 | | |
798 | | int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) |
799 | 0 | { |
800 | 0 | for (int n = 0; ff_sws_op_backends[n]; n++) { |
801 | 0 | const SwsOpBackend *backend = ff_sws_op_backends[n]; |
802 | 0 | if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) |
803 | 0 | continue; |
804 | | |
805 | 0 | av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " |
806 | 0 | "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", |
807 | 0 | backend->name, out->block_size, out->over_read, out->over_write, |
808 | 0 | out->cpu_flags); |
809 | 0 | return 0; |
810 | 0 | } |
811 | | |
812 | 0 | av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); |
813 | 0 | ff_sws_op_list_print(ctx, AV_LOG_WARNING, ops); |
814 | 0 | return AVERROR(ENOTSUP); |
815 | 0 | } |
816 | | |
/* Per-pass execution state for a compiled operation list. Most geometry
 * fields are (re)computed in op_pass_setup() from the image parameters. */
typedef struct SwsOpPass {
    SwsCompiledOp comp;   /* the compiled kernel + its metadata */
    SwsOpExec exec_base;  /* template execution context, strides/bumps filled
                           * in by op_pass_setup() */
    int num_blocks;       /* number of kernel blocks per row (width rounded up
                           * to the block size) */
    int tail_off_in;      /* input byte offset of the last (partial) block */
    int tail_off_out;     /* output byte offset of the last (partial) block */
    int tail_size_in;     /* input byte size of the tail region */
    int tail_size_out;    /* output byte size of the tail region */
    int planes_in;        /* NOTE(review): plane counts / pixel sizes appear to
                           * be initialized outside this chunk — confirm */
    int planes_out;
    int pixel_bits_in;    /* bits per pixel of input, used for byte offsets */
    int pixel_bits_out;   /* bits per pixel of output */
    bool memcpy_in;       /* input rows too small for direct access; bounce
                           * through a temporary buffer */
    bool memcpy_out;      /* likewise for output rows */
} SwsOpPass;
832 | | |
833 | | static void op_pass_free(void *ptr) |
834 | 0 | { |
835 | 0 | SwsOpPass *p = ptr; |
836 | 0 | if (!p) |
837 | 0 | return; |
838 | | |
839 | 0 | if (p->comp.free) |
840 | 0 | p->comp.free(p->comp.priv); |
841 | |
|
842 | 0 | av_free(p); |
843 | 0 | } |
844 | | |
/* Recompute per-image geometry for an op pass: block counts, tail offsets,
 * strides, whether bounce buffers (memcpy) are needed, and the per-row
 * pointer bumps for the main loop.
 *
 * NOTE(review): assumes p->pixel_bits_in/out and p->planes_in/out were
 * already initialized when the pass was created — confirm against the code
 * outside this chunk. */
static void op_pass_setup(const SwsImg *out, const SwsImg *in, const SwsPass *pass)
{
    const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->fmt);
    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->fmt);

    SwsOpPass *p = pass->priv;
    SwsOpExec *exec = &p->exec_base;
    const SwsCompiledOp *comp = &p->comp;
    const int block_size = comp->block_size;
    p->num_blocks = (pass->width + block_size - 1) / block_size; /* ceil div */

    /* Set up main loop parameters */
    const int aligned_w  = p->num_blocks * block_size;
    const int safe_width = (p->num_blocks - 1) * block_size;
    const int tail_size  = pass->width - safe_width;
    p->tail_off_in   = safe_width * p->pixel_bits_in  >> 3;
    p->tail_off_out  = safe_width * p->pixel_bits_out >> 3;
    p->tail_size_in  = tail_size  * p->pixel_bits_in  >> 3;
    p->tail_size_out = tail_size  * p->pixel_bits_out >> 3;
    p->memcpy_in     = false;
    p->memcpy_out    = false;

    /* A plane needs bouncing through a temporary if the kernel's aligned
     * access (plus over-read padding) would exceed the row stride */
    for (int i = 0; i < p->planes_in; i++) {
        const int sub_x      = (i == 1 || i == 2) ? indesc->log2_chroma_w : 0;
        const int plane_w    = (aligned_w + sub_x) >> sub_x;
        const int plane_pad  = (comp->over_read + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_in >> 3;
        p->memcpy_in |= plane_size + plane_pad > in->linesize[i];
        exec->in_stride[i] = in->linesize[i];
    }

    for (int i = 0; i < p->planes_out; i++) {
        const int sub_x      = (i == 1 || i == 2) ? outdesc->log2_chroma_w : 0;
        const int plane_w    = (aligned_w + sub_x) >> sub_x;
        const int plane_pad  = (comp->over_write + sub_x) >> sub_x;
        const int plane_size = plane_w * p->pixel_bits_out >> 3;
        p->memcpy_out |= plane_size + plane_pad > out->linesize[i];
        exec->out_stride[i] = out->linesize[i];
    }

    /* Pre-fill pointer bump for the main section only; this value does not
     * matter at all for the tail / last row handlers because they only ever
     * process a single line */
    const int blocks_main = p->num_blocks - p->memcpy_out;
    for (int i = 0; i < 4; i++) {
        exec->in_bump[i]  = in->linesize[i]  - blocks_main * exec->block_size_in;
        exec->out_bump[i] = out->linesize[i] - blocks_main * exec->block_size_out;
    }
}
894 | | |
/* Dispatch kernel over the last column of the image using memcpy */
/**
 * Runs the compiled kernel on the final (tail) block column of rows
 * [y, y+h), optionally staging input and/or output through an aligned
 * on-stack scratch buffer so the kernel's over-read/over-write cannot
 * touch memory outside the caller's planes.
 *
 * Takes over the in/out pointers and strides in @exec, so the caller must
 * not reuse @exec for regular dispatch afterwards.
 */
static av_always_inline void
handle_tail(const SwsOpPass *p, SwsOpExec *exec,
            const SwsImg *out_base, const bool copy_out,
            const SwsImg *in_base, const bool copy_in,
            int y, const int h)
{
    /* tmp[0][*] holds staged input planes, tmp[1][*] staged output planes;
     * sized for one block of up to 4 planes of 32-bit pixels at a block
     * size of up to 128 */
    DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];

    const SwsCompiledOp *comp = &p->comp;
    const int tail_size_in = p->tail_size_in;
    const int tail_size_out = p->tail_size_out;
    const int bx = p->num_blocks - 1; /* block index of the tail column */

    SwsImg in = ff_sws_img_shift(in_base, y);
    SwsImg out = ff_sws_img_shift(out_base, y);
    /* Point each plane at its tail block; when staging, redirect the exec
     * pointers into the scratch buffer instead */
    for (int i = 0; i < p->planes_in; i++) {
        in.data[i] += p->tail_off_in;
        if (copy_in) {
            exec->in[i] = (void *) tmp[0][i];
            exec->in_stride[i] = sizeof(tmp[0][i]);
        } else {
            exec->in[i] = in.data[i];
        }
    }

    for (int i = 0; i < p->planes_out; i++) {
        out.data[i] += p->tail_off_out;
        if (copy_out) {
            exec->out[i] = (void *) tmp[1][i];
            exec->out_stride[i] = sizeof(tmp[1][i]);
        } else {
            exec->out[i] = out.data[i];
        }
    }

    for (int y_end = y + h; y < y_end; y++) {
        if (copy_in) {
            /* Stage only the valid bytes of the tail; the kernel may read
             * (garbage) past them inside the scratch buffer, which is safe */
            for (int i = 0; i < p->planes_in; i++) {
                av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
                memcpy(tmp[0][i], in.data[i], tail_size_in);
                in.data[i] += in.linesize[i];
            }
        }

        /* Process exactly one row of the single tail block */
        comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);

        if (copy_out) {
            /* Copy back only the valid output bytes, discarding over-write */
            for (int i = 0; i < p->planes_out; i++) {
                av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
                memcpy(out.data[i], tmp[1][i], tail_size_out);
                out.data[i] += out.linesize[i];
            }
        }

        /* Advance any non-staged pointers to the next row; staged pointers
         * keep pointing at the scratch buffer */
        for (int i = 0; i < 4; i++) {
            if (!copy_in)
                exec->in[i] += in.linesize[i];
            if (!copy_out)
                exec->out[i] += out.linesize[i];
        }
    }
}
958 | | |
/**
 * Pass entry point: runs the compiled kernel over a horizontal slice
 * [y, y+h) of the image, splitting the work into a fast main section and
 * memcpy-protected edge cases (see the safety discussion below).
 */
static void op_pass_run(const SwsImg *out_base, const SwsImg *in_base,
                        const int y, const int h, const SwsPass *pass)
{
    const SwsOpPass *p = pass->priv;
    const SwsCompiledOp *comp = &p->comp;
    const SwsImg in = ff_sws_img_shift(in_base, y);
    const SwsImg out = ff_sws_img_shift(out_base, y);

    /* Fill exec metadata for this slice */
    DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
    exec.slice_y = y;
    exec.slice_h = h;
    for (int i = 0; i < 4; i++) {
        exec.in[i] = in.data[i];
        exec.out[i] = out.data[i];
    }

    /**
     * To ensure safety, we need to consider the following:
     *
     * 1. We can overread the input, unless this is the last line of an
     *    unpadded buffer. All defined operations can handle arbitrary pixel
     *    input, so overread of arbitrary data is fine.
     *
     * 2. We can overwrite the output, as long as we don't write more than the
     *    amount of pixels that fit into one linesize. So we always need to
     *    memcpy the last column on the output side if unpadded.
     *
     * 3. For the last row, we also need to memcpy the remainder of the input,
     *    to avoid reading past the end of the buffer. Note that since we know
     *    the run() function is called on stripes of the same buffer, we don't
     *    need to worry about this for the end of a slice.
     */

    const int last_slice = y + h == pass->height;
    const bool memcpy_in = last_slice && p->memcpy_in;
    const bool memcpy_out = p->memcpy_out;
    const int num_blocks = p->num_blocks;
    /* Exclude the tail column / last row from the main dispatch when they
     * need protected handling */
    const int blocks_main = num_blocks - memcpy_out;
    const int h_main = h - memcpy_in;

    /* Handle main section */
    comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);

    if (memcpy_in) {
        /* Safe part of last row */
        for (int i = 0; i < 4; i++) {
            exec.in[i] += h_main * in.linesize[i];
            exec.out[i] += h_main * out.linesize[i];
        }
        comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
    }

    /* Handle last column via memcpy, takes over `exec` so call these last */
    if (memcpy_out)
        handle_tail(p, &exec, out_base, true, in_base, false, y, h_main);
    if (memcpy_in)
        handle_tail(p, &exec, out_base, memcpy_out, in_base, true, y + h_main, 1);
}
1018 | | |
1019 | | static int rw_planes(const SwsOp *op) |
1020 | 0 | { |
1021 | 0 | return op->rw.packed ? 1 : op->rw.elems; |
1022 | 0 | } |
1023 | | |
1024 | | static int rw_pixel_bits(const SwsOp *op) |
1025 | 0 | { |
1026 | 0 | const int elems = op->rw.packed ? op->rw.elems : 1; |
1027 | 0 | const int size = ff_sws_pixel_type_size(op->type); |
1028 | 0 | const int bits = 8 >> op->rw.frac; |
1029 | 0 | av_assert1(bits >= 1); |
1030 | 0 | return elems * size * bits; |
1031 | 0 | } |
1032 | | |
/**
 * Validates and (optionally) optimizes an operation list, compiles it via
 * the first backend that supports it, and registers the result as a new
 * graph pass producing the destination format.
 *
 * The list must contain at least a leading SWS_OP_READ and a trailing
 * SWS_OP_WRITE. On success, *output receives the created pass (owned by
 * the graph). Returns 0 on success or a negative AVERROR code.
 */
int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, SwsFormat dst,
                        SwsPass *input, SwsPass **output)
{
    SwsContext *ctx = graph->ctx;
    SwsOpPass *p = NULL;
    const SwsOp *read = &ops->ops[0];
    const SwsOp *write = &ops->ops[ops->num_ops - 1];
    SwsPass *pass;
    int ret;

    if (ops->num_ops < 2) {
        av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n");
        return AVERROR(EINVAL);
    }

    if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) {
        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
               "and write, respectively.\n");
        return AVERROR(EINVAL);
    }

    /* Even without full optimization, component metadata must be current
     * before compiling */
    if (flags & SWS_OP_FLAG_OPTIMIZE)
        RET(ff_sws_op_list_optimize(ops));
    else
        ff_sws_op_list_update_comps(ops);

    p = av_mallocz(sizeof(*p));
    if (!p)
        return AVERROR(ENOMEM);

    ret = ff_sws_ops_compile(ctx, ops, &p->comp);
    if (ret < 0)
        goto fail;

    /* Derive static layout info from the terminal read/write ops; the
     * per-frame fields are filled in later by op_pass_setup() */
    p->planes_in = rw_planes(read);
    p->planes_out = rw_planes(write);
    p->pixel_bits_in = rw_pixel_bits(read);
    p->pixel_bits_out = rw_pixel_bits(write);
    p->exec_base = (SwsOpExec) {
        .width = dst.width,
        .height = dst.height,
        .block_size_in = p->comp.block_size * p->pixel_bits_in >> 3,
        .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
    };

    pass = ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height, input,
                                 1, p, op_pass_run);
    if (!pass) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }
    /* The graph now owns `p` via pass->free */
    pass->setup = op_pass_setup;
    pass->free = op_pass_free;

    *output = pass;
    return 0;

fail:
    op_pass_free(p);
    return ret;
}