/src/ffmpeg/libswscale/ops_optimizer.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "libavutil/avassert.h" |
22 | | #include "libavutil/bswap.h" |
23 | | #include "libavutil/rational.h" |
24 | | |
25 | | #include "ops.h" |
26 | | #include "ops_internal.h" |
27 | | |
28 | | #define RET(x) \ |
29 | 0 | do { \ |
30 | 0 | if ((ret = (x)) < 0) \ |
31 | 0 | return ret; \ |
32 | 0 | } while (0) |
33 | | |
34 | | /** |
35 | | * Try to commute a clear op with the next operation. Makes any adjustments |
36 | | * to the operations as needed, but does not perform the actual commutation. |
37 | | * |
38 | | * Returns whether successful. |
39 | | */ |
40 | | static bool op_commute_clear(SwsOp *op, SwsOp *next) |
41 | 0 | { |
42 | 0 | SwsOp tmp; |
43 | |
|
44 | 0 | av_assert1(op->op == SWS_OP_CLEAR); |
45 | 0 | switch (next->op) { |
46 | 0 | case SWS_OP_CONVERT: |
47 | 0 | op->type = next->convert.to; |
48 | | /* fall through */ |
49 | 0 | case SWS_OP_LSHIFT: |
50 | 0 | case SWS_OP_RSHIFT: |
51 | 0 | case SWS_OP_DITHER: |
52 | 0 | case SWS_OP_MIN: |
53 | 0 | case SWS_OP_MAX: |
54 | 0 | case SWS_OP_SCALE: |
55 | 0 | case SWS_OP_READ: |
56 | 0 | case SWS_OP_SWIZZLE: |
57 | 0 | ff_sws_apply_op_q(next, op->c.q4); |
58 | 0 | return true; |
59 | 0 | case SWS_OP_SWAP_BYTES: |
60 | 0 | switch (next->type) { |
61 | 0 | case SWS_PIXEL_U16: |
62 | 0 | ff_sws_apply_op_q(next, op->c.q4); /* always works */ |
63 | 0 | return true; |
64 | 0 | case SWS_PIXEL_U32: |
65 | 0 | for (int i = 0; i < 4; i++) { |
66 | 0 | uint32_t v = av_bswap32(op->c.q4[i].num); |
67 | 0 | if (v > INT_MAX) |
68 | 0 | return false; /* can't represent as AVRational anymore */ |
69 | 0 | tmp.c.q4[i] = Q(v); |
70 | 0 | } |
71 | 0 | op->c = tmp.c; |
72 | 0 | return true; |
73 | 0 | default: |
74 | 0 | return false; |
75 | 0 | } |
76 | 0 | case SWS_OP_INVALID: |
77 | 0 | case SWS_OP_WRITE: |
78 | 0 | case SWS_OP_LINEAR: |
79 | 0 | case SWS_OP_PACK: |
80 | 0 | case SWS_OP_UNPACK: |
81 | 0 | case SWS_OP_CLEAR: |
82 | 0 | return false; |
83 | 0 | case SWS_OP_TYPE_NB: |
84 | 0 | break; |
85 | 0 | } |
86 | | |
87 | 0 | av_unreachable("Invalid operation type!"); |
88 | 0 | return false; |
89 | 0 | } |
90 | | |
91 | | /** |
92 | | * Try to commute a swizzle op with the next operation. Makes any adjustments |
93 | | * to the operations as needed, but does not perform the actual commutation. |
94 | | * |
95 | | * Returns whether successful. |
96 | | */ |
97 | | static bool op_commute_swizzle(SwsOp *op, SwsOp *next) |
98 | 0 | { |
99 | 0 | bool seen[4] = {0}; |
100 | |
|
101 | 0 | av_assert1(op->op == SWS_OP_SWIZZLE); |
102 | 0 | switch (next->op) { |
103 | 0 | case SWS_OP_CONVERT: |
104 | 0 | op->type = next->convert.to; |
105 | | /* fall through */ |
106 | 0 | case SWS_OP_SWAP_BYTES: |
107 | 0 | case SWS_OP_LSHIFT: |
108 | 0 | case SWS_OP_RSHIFT: |
109 | 0 | case SWS_OP_SCALE: |
110 | 0 | return true; |
111 | | |
112 | | /** |
113 | | * We can commute per-channel ops only if the per-channel constants are the |
114 | | * same for all duplicated channels; e.g.: |
115 | | * SWIZZLE {0, 0, 0, 3} |
116 | | * NEXT {x, x, x, w} |
117 | | * -> |
118 | | * NEXT {x, _, _, w} |
119 | | * SWIZZLE {0, 0, 0, 3} |
120 | | */ |
121 | 0 | case SWS_OP_MIN: |
122 | 0 | case SWS_OP_MAX: { |
123 | 0 | const SwsConst c = next->c; |
124 | 0 | for (int i = 0; i < 4; i++) { |
125 | 0 | if (next->comps.unused[i]) |
126 | 0 | continue; |
127 | 0 | const int j = op->swizzle.in[i]; |
128 | 0 | if (seen[j] && av_cmp_q(next->c.q4[j], c.q4[i])) |
129 | 0 | return false; |
130 | 0 | next->c.q4[j] = c.q4[i]; |
131 | 0 | seen[j] = true; |
132 | 0 | } |
133 | 0 | return true; |
134 | 0 | } |
135 | | |
136 | 0 | case SWS_OP_DITHER: { |
137 | 0 | const SwsDitherOp d = next->dither; |
138 | 0 | for (int i = 0; i < 4; i++) { |
139 | 0 | if (next->comps.unused[i]) |
140 | 0 | continue; |
141 | 0 | const int j = op->swizzle.in[i]; |
142 | 0 | if (seen[j] && next->dither.y_offset[j] != d.y_offset[i]) |
143 | 0 | return false; |
144 | 0 | next->dither.y_offset[j] = d.y_offset[i]; |
145 | 0 | seen[j] = true; |
146 | 0 | } |
147 | 0 | return true; |
148 | 0 | } |
149 | | |
150 | 0 | case SWS_OP_INVALID: |
151 | 0 | case SWS_OP_READ: |
152 | 0 | case SWS_OP_WRITE: |
153 | 0 | case SWS_OP_SWIZZLE: |
154 | 0 | case SWS_OP_CLEAR: |
155 | 0 | case SWS_OP_LINEAR: |
156 | 0 | case SWS_OP_PACK: |
157 | 0 | case SWS_OP_UNPACK: |
158 | 0 | return false; |
159 | 0 | case SWS_OP_TYPE_NB: |
160 | 0 | break; |
161 | 0 | } |
162 | | |
163 | 0 | av_unreachable("Invalid operation type!"); |
164 | 0 | return false; |
165 | 0 | } |
166 | | |
167 | | /* returns log2(x) only if x is a power of two, or 0 otherwise */ |
168 | | static int exact_log2(const int x) |
169 | 0 | { |
170 | 0 | int p; |
171 | 0 | if (x <= 0) |
172 | 0 | return 0; |
173 | 0 | p = av_log2(x); |
174 | 0 | return (1 << p) == x ? p : 0; |
175 | 0 | } |
176 | | |
177 | | static int exact_log2_q(const AVRational x) |
178 | 0 | { |
179 | 0 | if (x.den == 1) |
180 | 0 | return exact_log2(x.num); |
181 | 0 | else if (x.num == 1) |
182 | 0 | return -exact_log2(x.den); |
183 | 0 | else |
184 | 0 | return 0; |
185 | 0 | } |
186 | | |
187 | | /** |
188 | | * If a linear operation can be reduced to a scalar multiplication, returns |
189 | | * the corresponding scaling factor, or 0 otherwise. |
190 | | */ |
191 | | static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next, |
192 | | SwsConst *out_scale) |
193 | 0 | { |
194 | 0 | SwsConst scale = {0}; |
195 | | |
196 | | /* There are components not on the main diagonal */ |
197 | 0 | if (c->mask & ~SWS_MASK_DIAG4) |
198 | 0 | return false; |
199 | | |
200 | 0 | for (int i = 0; i < 4; i++) { |
201 | 0 | const AVRational s = c->m[i][i]; |
202 | 0 | if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i]) |
203 | 0 | continue; |
204 | 0 | if (scale.q.den && av_cmp_q(s, scale.q)) |
205 | 0 | return false; |
206 | 0 | scale.q = s; |
207 | 0 | } |
208 | | |
209 | 0 | if (scale.q.den) |
210 | 0 | *out_scale = scale; |
211 | 0 | return scale.q.den; |
212 | 0 | } |
213 | | |
214 | | /* Extracts an integer clear operation (subset) from the given linear op. */ |
215 | | static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev, |
216 | | SwsConst *out_clear) |
217 | 0 | { |
218 | 0 | SwsConst clear = {0}; |
219 | 0 | bool ret = false; |
220 | |
|
221 | 0 | for (int i = 0; i < 4; i++) { |
222 | 0 | bool const_row = c->m[i][4].den == 1; /* offset is integer */ |
223 | 0 | for (int j = 0; j < 4; j++) { |
224 | 0 | const_row &= c->m[i][j].num == 0 || /* scalar is zero */ |
225 | 0 | (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */ |
226 | 0 | } |
227 | 0 | if (const_row && (c->mask & SWS_MASK_ROW(i))) { |
228 | 0 | clear.q4[i] = c->m[i][4]; |
229 | 0 | for (int j = 0; j < 5; j++) |
230 | 0 | c->m[i][j] = Q(i == j); |
231 | 0 | c->mask &= ~SWS_MASK_ROW(i); |
232 | 0 | ret = true; |
233 | 0 | } |
234 | 0 | } |
235 | |
|
236 | 0 | if (ret) |
237 | 0 | *out_clear = clear; |
238 | 0 | return ret; |
239 | 0 | } |
240 | | |
241 | | /* Unswizzle a linear operation by aligning single-input rows with |
242 | | * their corresponding diagonal */ |
243 | | static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz) |
244 | 0 | { |
245 | 0 | SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3); |
246 | 0 | SwsLinearOp c = *op; |
247 | | |
248 | | /* Find non-zero coefficients in the main 4x4 matrix */ |
249 | 0 | uint32_t nonzero = 0; |
250 | 0 | for (int i = 0; i < 4; i++) { |
251 | 0 | for (int j = 0; j < 4; j++) { |
252 | 0 | if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO)) |
253 | 0 | continue; |
254 | 0 | nonzero |= SWS_MASK(i, j); |
255 | 0 | } |
256 | 0 | } |
257 | | |
258 | | /* If a value is unique in its row and the target column is |
259 | | * empty, move it there and update the input swizzle */ |
260 | 0 | for (int i = 0; i < 4; i++) { |
261 | 0 | if (nonzero & SWS_MASK_COL(i)) |
262 | 0 | continue; /* target column is not empty */ |
263 | 0 | for (int j = 0; j < 4; j++) { |
264 | 0 | if ((nonzero & SWS_MASK_ROW(i)) == SWS_MASK(i, j)) { |
265 | | /* Move coefficient to the diagonal */ |
266 | 0 | c.m[i][i] = c.m[i][j]; |
267 | 0 | c.m[i][j] = Q(0); |
268 | 0 | swiz.in[i] = j; |
269 | 0 | break; |
270 | 0 | } |
271 | 0 | } |
272 | 0 | } |
273 | |
|
274 | 0 | if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask) |
275 | 0 | return false; /* no swizzle was identified */ |
276 | | |
277 | 0 | c.mask = ff_sws_linear_mask(c); |
278 | 0 | *out_swiz = swiz; |
279 | 0 | *op = c; |
280 | 0 | return true; |
281 | 0 | } |
282 | | |
283 | | int ff_sws_op_list_optimize(SwsOpList *ops) |
284 | 0 | { |
285 | 0 | int ret; |
286 | |
|
287 | 0 | retry: |
288 | 0 | ff_sws_op_list_update_comps(ops); |
289 | | |
290 | | /* Apply all in-place optimizations (that do not re-order the list) */ |
291 | 0 | for (int n = 0; n < ops->num_ops; n++) { |
292 | 0 | SwsOp dummy = {0}; |
293 | 0 | SwsOp *op = &ops->ops[n]; |
294 | 0 | SwsOp *prev = n ? &ops->ops[n - 1] : &dummy; |
295 | 0 | SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy; |
296 | | |
297 | | /* common helper variable */ |
298 | 0 | bool noop = true; |
299 | |
|
300 | 0 | if (next->comps.unused[0] && next->comps.unused[1] && |
301 | 0 | next->comps.unused[2] && next->comps.unused[3]) |
302 | 0 | { |
303 | | /* Remove completely unused operations */ |
304 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
305 | 0 | goto retry; |
306 | 0 | } |
307 | | |
308 | 0 | switch (op->op) { |
309 | 0 | case SWS_OP_READ: |
310 | | /* "Compress" planar reads where not all components are needed */ |
311 | 0 | if (!op->rw.packed) { |
312 | 0 | SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3); |
313 | 0 | int nb_planes = 0; |
314 | 0 | for (int i = 0; i < op->rw.elems; i++) { |
315 | 0 | if (next->comps.unused[i]) { |
316 | 0 | swiz.in[i] = 3 - (i - nb_planes); /* map to unused plane */ |
317 | 0 | continue; |
318 | 0 | } |
319 | | |
320 | 0 | const int idx = nb_planes++; |
321 | 0 | av_assert1(idx <= i); |
322 | 0 | ops->order_src.in[idx] = ops->order_src.in[i]; |
323 | 0 | swiz.in[i] = idx; |
324 | 0 | } |
325 | |
|
326 | 0 | if (nb_planes < op->rw.elems) { |
327 | 0 | op->rw.elems = nb_planes; |
328 | 0 | RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) { |
329 | 0 | .op = SWS_OP_SWIZZLE, |
330 | 0 | .type = op->type, |
331 | 0 | .swizzle = swiz, |
332 | 0 | })); |
333 | 0 | goto retry; |
334 | 0 | } |
335 | 0 | } |
336 | 0 | break; |
337 | | |
338 | 0 | case SWS_OP_SWAP_BYTES: |
339 | | /* Redundant (double) swap */ |
340 | 0 | if (next->op == SWS_OP_SWAP_BYTES) { |
341 | 0 | ff_sws_op_list_remove_at(ops, n, 2); |
342 | 0 | goto retry; |
343 | 0 | } |
344 | 0 | break; |
345 | | |
346 | 0 | case SWS_OP_UNPACK: |
347 | | /* Redundant unpack+pack */ |
348 | 0 | if (next->op == SWS_OP_PACK && next->type == op->type && |
349 | 0 | next->pack.pattern[0] == op->pack.pattern[0] && |
350 | 0 | next->pack.pattern[1] == op->pack.pattern[1] && |
351 | 0 | next->pack.pattern[2] == op->pack.pattern[2] && |
352 | 0 | next->pack.pattern[3] == op->pack.pattern[3]) |
353 | 0 | { |
354 | 0 | ff_sws_op_list_remove_at(ops, n, 2); |
355 | 0 | goto retry; |
356 | 0 | } |
357 | 0 | break; |
358 | | |
359 | 0 | case SWS_OP_LSHIFT: |
360 | 0 | case SWS_OP_RSHIFT: |
361 | | /* Two shifts in the same direction */ |
362 | 0 | if (next->op == op->op) { |
363 | 0 | op->c.u += next->c.u; |
364 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
365 | 0 | goto retry; |
366 | 0 | } |
367 | | |
368 | | /* No-op shift */ |
369 | 0 | if (!op->c.u) { |
370 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
371 | 0 | goto retry; |
372 | 0 | } |
373 | 0 | break; |
374 | | |
375 | 0 | case SWS_OP_CLEAR: |
376 | 0 | for (int i = 0; i < 4; i++) { |
377 | 0 | if (!op->c.q4[i].den) |
378 | 0 | continue; |
379 | | |
380 | 0 | if ((prev->comps.flags[i] & SWS_COMP_ZERO) && |
381 | 0 | !(prev->comps.flags[i] & SWS_COMP_GARBAGE) && |
382 | 0 | op->c.q4[i].num == 0) |
383 | 0 | { |
384 | | /* Redundant clear-to-zero of zero component */ |
385 | 0 | op->c.q4[i].den = 0; |
386 | 0 | } else if (next->comps.unused[i]) { |
387 | | /* Unnecessary clear of unused component */ |
388 | 0 | op->c.q4[i] = (AVRational) {0, 0}; |
389 | 0 | } else if (op->c.q4[i].den) { |
390 | 0 | noop = false; |
391 | 0 | } |
392 | 0 | } |
393 | |
|
394 | 0 | if (noop) { |
395 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
396 | 0 | goto retry; |
397 | 0 | } |
398 | | |
399 | | /* Transitive clear */ |
400 | 0 | if (next->op == SWS_OP_CLEAR) { |
401 | 0 | for (int i = 0; i < 4; i++) { |
402 | 0 | if (next->c.q4[i].den) |
403 | 0 | op->c.q4[i] = next->c.q4[i]; |
404 | 0 | } |
405 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
406 | 0 | goto retry; |
407 | 0 | } |
408 | 0 | break; |
409 | | |
410 | 0 | case SWS_OP_SWIZZLE: |
411 | 0 | for (int i = 0; i < 4; i++) { |
412 | 0 | if (next->comps.unused[i]) |
413 | 0 | continue; |
414 | 0 | if (op->swizzle.in[i] != i) |
415 | 0 | noop = false; |
416 | 0 | } |
417 | | |
418 | | /* Identity swizzle */ |
419 | 0 | if (noop) { |
420 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
421 | 0 | goto retry; |
422 | 0 | } |
423 | | |
424 | | /* Transitive swizzle */ |
425 | 0 | if (next->op == SWS_OP_SWIZZLE) { |
426 | 0 | const SwsSwizzleOp orig = op->swizzle; |
427 | 0 | for (int i = 0; i < 4; i++) |
428 | 0 | op->swizzle.in[i] = orig.in[next->swizzle.in[i]]; |
429 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
430 | 0 | goto retry; |
431 | 0 | } |
432 | | |
433 | | /* Swizzle planes instead of components, if possible */ |
434 | 0 | if (prev->op == SWS_OP_READ && !prev->rw.packed) { |
435 | 0 | for (int dst = 0; dst < prev->rw.elems; dst++) { |
436 | 0 | const int src = op->swizzle.in[dst]; |
437 | 0 | if (src > dst && src < prev->rw.elems) { |
438 | 0 | FFSWAP(int, ops->order_src.in[dst], ops->order_src.in[src]); |
439 | 0 | for (int i = dst; i < 4; i++) { |
440 | 0 | if (op->swizzle.in[i] == dst) |
441 | 0 | op->swizzle.in[i] = src; |
442 | 0 | else if (op->swizzle.in[i] == src) |
443 | 0 | op->swizzle.in[i] = dst; |
444 | 0 | } |
445 | 0 | goto retry; |
446 | 0 | } |
447 | 0 | } |
448 | 0 | } |
449 | | |
450 | 0 | if (next->op == SWS_OP_WRITE && !next->rw.packed) { |
451 | 0 | for (int dst = 0; dst < next->rw.elems; dst++) { |
452 | 0 | const int src = op->swizzle.in[dst]; |
453 | 0 | if (src > dst && src < next->rw.elems) { |
454 | 0 | FFSWAP(int, ops->order_dst.in[dst], ops->order_dst.in[src]); |
455 | 0 | FFSWAP(int, op->swizzle.in[dst], op->swizzle.in[src]); |
456 | 0 | goto retry; |
457 | 0 | } |
458 | 0 | } |
459 | 0 | } |
460 | 0 | break; |
461 | | |
462 | 0 | case SWS_OP_CONVERT: |
463 | | /* No-op conversion */ |
464 | 0 | if (op->type == op->convert.to) { |
465 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
466 | 0 | goto retry; |
467 | 0 | } |
468 | | |
469 | | /* Transitive conversion */ |
470 | 0 | if (next->op == SWS_OP_CONVERT && |
471 | 0 | op->convert.expand == next->convert.expand) |
472 | 0 | { |
473 | 0 | av_assert1(op->convert.to == next->type); |
474 | 0 | op->convert.to = next->convert.to; |
475 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
476 | 0 | goto retry; |
477 | 0 | } |
478 | | |
479 | | /* Conversion followed by integer expansion */ |
480 | 0 | if (next->op == SWS_OP_SCALE && !op->convert.expand && |
481 | 0 | ff_sws_pixel_type_is_int(op->type) && |
482 | 0 | ff_sws_pixel_type_is_int(op->convert.to) && |
483 | 0 | !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to))) |
484 | 0 | { |
485 | 0 | op->convert.expand = true; |
486 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
487 | 0 | goto retry; |
488 | 0 | } |
489 | 0 | break; |
490 | | |
491 | 0 | case SWS_OP_MIN: |
492 | 0 | for (int i = 0; i < 4; i++) { |
493 | 0 | if (next->comps.unused[i] || !op->c.q4[i].den) |
494 | 0 | continue; |
495 | 0 | if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0) |
496 | 0 | noop = false; |
497 | 0 | } |
498 | |
|
499 | 0 | if (noop) { |
500 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
501 | 0 | goto retry; |
502 | 0 | } |
503 | 0 | break; |
504 | | |
505 | 0 | case SWS_OP_MAX: |
506 | 0 | for (int i = 0; i < 4; i++) { |
507 | 0 | if (next->comps.unused[i] || !op->c.q4[i].den) |
508 | 0 | continue; |
509 | 0 | if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0) |
510 | 0 | noop = false; |
511 | 0 | } |
512 | |
|
513 | 0 | if (noop) { |
514 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
515 | 0 | goto retry; |
516 | 0 | } |
517 | 0 | break; |
518 | | |
519 | 0 | case SWS_OP_DITHER: |
520 | 0 | for (int i = 0; i < 4; i++) { |
521 | 0 | if (next->comps.unused[i] || op->dither.y_offset[i] < 0) |
522 | 0 | continue; |
523 | 0 | if (prev->comps.flags[i] & SWS_COMP_EXACT) { |
524 | 0 | op->dither.y_offset[i] = -1; /* unnecessary dither */ |
525 | 0 | goto retry; |
526 | 0 | } else { |
527 | 0 | noop = false; |
528 | 0 | } |
529 | 0 | } |
530 | | |
531 | 0 | if (noop) { |
532 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
533 | 0 | goto retry; |
534 | 0 | } |
535 | 0 | break; |
536 | | |
537 | 0 | case SWS_OP_LINEAR: { |
538 | 0 | SwsSwizzleOp swizzle; |
539 | 0 | SwsConst c; |
540 | | |
541 | | /* No-op (identity) linear operation */ |
542 | 0 | if (!op->lin.mask) { |
543 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
544 | 0 | goto retry; |
545 | 0 | } |
546 | | |
547 | 0 | if (next->op == SWS_OP_LINEAR) { |
548 | | /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */ |
549 | 0 | const SwsLinearOp m1 = op->lin; |
550 | 0 | const SwsLinearOp m2 = next->lin; |
551 | 0 | for (int i = 0; i < 4; i++) { |
552 | 0 | for (int j = 0; j < 5; j++) { |
553 | 0 | AVRational sum = Q(0); |
554 | 0 | for (int k = 0; k < 4; k++) |
555 | 0 | sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j])); |
556 | 0 | if (j == 4) /* m1.m[4][j] == 1 */ |
557 | 0 | sum = av_add_q(sum, m2.m[i][4]); |
558 | 0 | op->lin.m[i][j] = sum; |
559 | 0 | } |
560 | 0 | } |
561 | 0 | op->lin.mask = ff_sws_linear_mask(op->lin); |
562 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
563 | 0 | goto retry; |
564 | 0 | } |
565 | | |
566 | | /* Optimize away zero columns */ |
567 | 0 | for (int j = 0; j < 4; j++) { |
568 | 0 | const uint32_t col = SWS_MASK_COL(j); |
569 | 0 | if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col)) |
570 | 0 | continue; |
571 | 0 | for (int i = 0; i < 4; i++) |
572 | 0 | op->lin.m[i][j] = Q(i == j); |
573 | 0 | op->lin.mask &= ~col; |
574 | 0 | goto retry; |
575 | 0 | } |
576 | | |
577 | | /* Optimize away unused rows */ |
578 | 0 | for (int i = 0; i < 4; i++) { |
579 | 0 | const uint32_t row = SWS_MASK_ROW(i); |
580 | 0 | if (!next->comps.unused[i] || !(op->lin.mask & row)) |
581 | 0 | continue; |
582 | 0 | for (int j = 0; j < 5; j++) |
583 | 0 | op->lin.m[i][j] = Q(i == j); |
584 | 0 | op->lin.mask &= ~row; |
585 | 0 | goto retry; |
586 | 0 | } |
587 | | |
588 | | /* Convert constant rows to explicit clear instruction */ |
589 | 0 | if (extract_constant_rows(&op->lin, prev->comps, &c)) { |
590 | 0 | RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) { |
591 | 0 | .op = SWS_OP_CLEAR, |
592 | 0 | .type = op->type, |
593 | 0 | .comps = op->comps, |
594 | 0 | .c = c, |
595 | 0 | })); |
596 | 0 | goto retry; |
597 | 0 | } |
598 | | |
599 | | /* Multiplication by scalar constant */ |
600 | 0 | if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) { |
601 | 0 | op->op = SWS_OP_SCALE; |
602 | 0 | op->c = c; |
603 | 0 | goto retry; |
604 | 0 | } |
605 | | |
606 | | /* Swizzle by fixed pattern */ |
607 | 0 | if (extract_swizzle(&op->lin, prev->comps, &swizzle)) { |
608 | 0 | RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) { |
609 | 0 | .op = SWS_OP_SWIZZLE, |
610 | 0 | .type = op->type, |
611 | 0 | .swizzle = swizzle, |
612 | 0 | })); |
613 | 0 | goto retry; |
614 | 0 | } |
615 | 0 | break; |
616 | 0 | } |
617 | | |
618 | 0 | case SWS_OP_SCALE: { |
619 | 0 | const int factor2 = exact_log2_q(op->c.q); |
620 | | |
621 | | /* No-op scaling */ |
622 | 0 | if (op->c.q.num == 1 && op->c.q.den == 1) { |
623 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
624 | 0 | goto retry; |
625 | 0 | } |
626 | | |
627 | | /* Scaling by exact power of two */ |
628 | 0 | if (factor2 && ff_sws_pixel_type_is_int(op->type)) { |
629 | 0 | op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT; |
630 | 0 | op->c.u = FFABS(factor2); |
631 | 0 | goto retry; |
632 | 0 | } |
633 | 0 | break; |
634 | 0 | } |
635 | 0 | } |
636 | 0 | } |
637 | | |
638 | | /* Push clears to the back to void any unused components */ |
639 | 0 | for (int n = 0; n < ops->num_ops - 1; n++) { |
640 | 0 | SwsOp *op = &ops->ops[n]; |
641 | 0 | SwsOp *next = &ops->ops[n + 1]; |
642 | |
|
643 | 0 | switch (op->op) { |
644 | 0 | case SWS_OP_CLEAR: |
645 | 0 | if (op_commute_clear(op, next)) { |
646 | 0 | FFSWAP(SwsOp, *op, *next); |
647 | 0 | goto retry; |
648 | 0 | } |
649 | 0 | break; |
650 | 0 | } |
651 | 0 | } |
652 | | |
653 | | /* Apply any remaining preferential re-ordering optimizations; do these |
654 | | * last because they are more likely to block other optimizations if done |
655 | | * too aggressively */ |
656 | 0 | for (int n = 0; n < ops->num_ops - 1; n++) { |
657 | 0 | SwsOp *op = &ops->ops[n]; |
658 | 0 | SwsOp *next = &ops->ops[n + 1]; |
659 | |
|
660 | 0 | switch (op->op) { |
661 | 0 | case SWS_OP_SWIZZLE: { |
662 | | /* Try to push swizzles towards the output */ |
663 | 0 | if (op_commute_swizzle(op, next)) { |
664 | 0 | FFSWAP(SwsOp, *op, *next); |
665 | 0 | goto retry; |
666 | 0 | } |
667 | 0 | break; |
668 | 0 | } |
669 | | |
670 | 0 | case SWS_OP_SCALE: |
671 | | /* Scaling by integer before conversion to int */ |
672 | 0 | if (op->c.q.den == 1 && next->op == SWS_OP_CONVERT && |
673 | 0 | ff_sws_pixel_type_is_int(next->convert.to)) |
674 | 0 | { |
675 | 0 | op->type = next->convert.to; |
676 | 0 | FFSWAP(SwsOp, *op, *next); |
677 | 0 | goto retry; |
678 | 0 | } |
679 | 0 | break; |
680 | 0 | } |
681 | 0 | } |
682 | | |
683 | 0 | return 0; |
684 | 0 | } |
685 | | |
686 | | int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[], |
687 | | int size, uint8_t clear_val, |
688 | | int *read_bytes, int *write_bytes) |
689 | 0 | { |
690 | 0 | if (!ops->num_ops) |
691 | 0 | return AVERROR(EINVAL); |
692 | | |
693 | 0 | const SwsOp *read = ff_sws_op_list_input(ops); |
694 | 0 | if (!read || read->rw.frac || (!read->rw.packed && read->rw.elems > 1)) |
695 | 0 | return AVERROR(ENOTSUP); |
696 | | |
697 | 0 | const int read_size = ff_sws_pixel_type_size(read->type); |
698 | 0 | uint32_t mask[4] = {0}; |
699 | 0 | for (int i = 0; i < read->rw.elems; i++) |
700 | 0 | mask[i] = 0x01010101 * i * read_size + 0x03020100; |
701 | |
|
702 | 0 | for (int opidx = 1; opidx < ops->num_ops; opidx++) { |
703 | 0 | const SwsOp *op = &ops->ops[opidx]; |
704 | 0 | switch (op->op) { |
705 | 0 | case SWS_OP_SWIZZLE: { |
706 | 0 | uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] }; |
707 | 0 | for (int i = 0; i < 4; i++) |
708 | 0 | mask[i] = orig[op->swizzle.in[i]]; |
709 | 0 | break; |
710 | 0 | } |
711 | | |
712 | 0 | case SWS_OP_SWAP_BYTES: |
713 | 0 | for (int i = 0; i < 4; i++) { |
714 | 0 | switch (ff_sws_pixel_type_size(op->type)) { |
715 | 0 | case 2: mask[i] = av_bswap16(mask[i]); break; |
716 | 0 | case 4: mask[i] = av_bswap32(mask[i]); break; |
717 | 0 | } |
718 | 0 | } |
719 | 0 | break; |
720 | | |
721 | 0 | case SWS_OP_CLEAR: |
722 | 0 | for (int i = 0; i < 4; i++) { |
723 | 0 | if (!op->c.q4[i].den) |
724 | 0 | continue; |
725 | 0 | if (op->c.q4[i].num != 0 || !clear_val) |
726 | 0 | return AVERROR(ENOTSUP); |
727 | 0 | mask[i] = 0x1010101ul * clear_val; |
728 | 0 | } |
729 | 0 | break; |
730 | | |
731 | 0 | case SWS_OP_CONVERT: { |
732 | 0 | if (!op->convert.expand) |
733 | 0 | return AVERROR(ENOTSUP); |
734 | 0 | for (int i = 0; i < 4; i++) { |
735 | 0 | switch (ff_sws_pixel_type_size(op->type)) { |
736 | 0 | case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF); break; |
737 | 0 | case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break; |
738 | 0 | } |
739 | 0 | } |
740 | 0 | break; |
741 | 0 | } |
742 | | |
743 | 0 | case SWS_OP_WRITE: { |
744 | 0 | if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1)) |
745 | 0 | return AVERROR(ENOTSUP); |
746 | | |
747 | | /* Initialize to no-op */ |
748 | 0 | memset(shuffle, clear_val, size); |
749 | |
|
750 | 0 | const int write_size = ff_sws_pixel_type_size(op->type); |
751 | 0 | const int read_chunk = read->rw.elems * read_size; |
752 | 0 | const int write_chunk = op->rw.elems * write_size; |
753 | 0 | const int num_groups = size / FFMAX(read_chunk, write_chunk); |
754 | 0 | for (int n = 0; n < num_groups; n++) { |
755 | 0 | const int base_in = n * read_chunk; |
756 | 0 | const int base_out = n * write_chunk; |
757 | 0 | for (int i = 0; i < op->rw.elems; i++) { |
758 | 0 | const int offset = base_out + i * write_size; |
759 | 0 | for (int b = 0; b < write_size; b++) { |
760 | 0 | const uint8_t idx = mask[i] >> (b * 8); |
761 | 0 | if (idx != clear_val) |
762 | 0 | shuffle[offset + b] = base_in + idx; |
763 | 0 | } |
764 | 0 | } |
765 | 0 | } |
766 | |
|
767 | 0 | *read_bytes = num_groups * read_chunk; |
768 | 0 | *write_bytes = num_groups * write_chunk; |
769 | 0 | return num_groups; |
770 | 0 | } |
771 | | |
772 | 0 | default: |
773 | 0 | return AVERROR(ENOTSUP); |
774 | 0 | } |
775 | 0 | } |
776 | | |
777 | 0 | return AVERROR(EINVAL); |
778 | 0 | } |