/src/ffmpeg/libswscale/ops_optimizer.c
Line | Count | Source |
1 | | /** |
2 | | * Copyright (C) 2025 Niklas Haas |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include "libavutil/avassert.h" |
22 | | #include "libavutil/bswap.h" |
23 | | #include "libavutil/rational.h" |
24 | | |
25 | | #include "ops.h" |
26 | | #include "ops_internal.h" |
27 | | |
28 | | #define RET(x) \ |
29 | 0 | do { \ |
30 | 0 | if ((ret = (x)) < 0) \ |
31 | 0 | return ret; \ |
32 | 0 | } while (0) |
33 | | |
34 | | /** |
35 | | * Try to commute a clear op with the next operation. Makes any adjustments |
36 | | * to the operations as needed, but does not perform the actual commutation. |
37 | | * |
38 | | * Returns whether successful. |
39 | | */ |
40 | | static bool op_commute_clear(SwsOp *op, SwsOp *next) |
41 | 0 | { |
42 | 0 | av_assert1(op->op == SWS_OP_CLEAR); |
43 | 0 | switch (next->op) { |
44 | 0 | case SWS_OP_CONVERT: |
45 | 0 | op->type = next->convert.to; |
46 | | /* fall through */ |
47 | 0 | case SWS_OP_LSHIFT: |
48 | 0 | case SWS_OP_RSHIFT: |
49 | 0 | case SWS_OP_DITHER: |
50 | 0 | case SWS_OP_MIN: |
51 | 0 | case SWS_OP_MAX: |
52 | 0 | case SWS_OP_SCALE: |
53 | 0 | case SWS_OP_READ: |
54 | 0 | case SWS_OP_SWIZZLE: |
55 | 0 | ff_sws_apply_op_q(next, op->c.q4); |
56 | 0 | return true; |
57 | 0 | case SWS_OP_INVALID: |
58 | 0 | case SWS_OP_SWAP_BYTES: |
59 | 0 | case SWS_OP_WRITE: |
60 | 0 | case SWS_OP_LINEAR: |
61 | 0 | case SWS_OP_PACK: |
62 | 0 | case SWS_OP_UNPACK: |
63 | 0 | case SWS_OP_CLEAR: |
64 | 0 | return false; |
65 | 0 | case SWS_OP_TYPE_NB: |
66 | 0 | break; |
67 | 0 | } |
68 | | |
69 | 0 | av_unreachable("Invalid operation type!"); |
70 | 0 | return false; |
71 | 0 | } |
72 | | |
73 | | /** |
74 | | * Try to commute a swizzle op with the next operation. Makes any adjustments |
75 | | * to the operations as needed, but does not perform the actual commutation. |
76 | | * |
77 | | * Returns whether successful. |
78 | | */ |
79 | | static bool op_commute_swizzle(SwsOp *op, SwsOp *next) |
80 | 0 | { |
81 | 0 | bool seen[4] = {0}; |
82 | |
|
83 | 0 | av_assert1(op->op == SWS_OP_SWIZZLE); |
84 | 0 | switch (next->op) { |
85 | 0 | case SWS_OP_CONVERT: |
86 | 0 | op->type = next->convert.to; |
87 | | /* fall through */ |
88 | 0 | case SWS_OP_SWAP_BYTES: |
89 | 0 | case SWS_OP_LSHIFT: |
90 | 0 | case SWS_OP_RSHIFT: |
91 | 0 | case SWS_OP_SCALE: |
92 | 0 | return true; |
93 | | |
94 | | /** |
95 | | * We can commute per-channel ops only if the per-channel constants are the |
96 | | * same for all duplicated channels; e.g.: |
97 | | * SWIZZLE {0, 0, 0, 3} |
98 | | * NEXT {x, x, x, w} |
99 | | * -> |
100 | | * NEXT {x, _, _, w} |
101 | | * SWIZZLE {0, 0, 0, 3} |
102 | | */ |
103 | 0 | case SWS_OP_MIN: |
104 | 0 | case SWS_OP_MAX: { |
105 | 0 | const SwsConst c = next->c; |
106 | 0 | for (int i = 0; i < 4; i++) { |
107 | 0 | if (next->comps.unused[i]) |
108 | 0 | continue; |
109 | 0 | const int j = op->swizzle.in[i]; |
110 | 0 | if (seen[j] && av_cmp_q(next->c.q4[j], c.q4[i])) |
111 | 0 | return false; |
112 | 0 | next->c.q4[j] = c.q4[i]; |
113 | 0 | seen[j] = true; |
114 | 0 | } |
115 | 0 | return true; |
116 | 0 | } |
117 | | |
118 | 0 | case SWS_OP_DITHER: { |
119 | 0 | const SwsDitherOp d = next->dither; |
120 | 0 | for (int i = 0; i < 4; i++) { |
121 | 0 | if (next->comps.unused[i]) |
122 | 0 | continue; |
123 | 0 | const int j = op->swizzle.in[i]; |
124 | 0 | if (seen[j] && next->dither.y_offset[j] != d.y_offset[i]) |
125 | 0 | return false; |
126 | 0 | next->dither.y_offset[j] = d.y_offset[i]; |
127 | 0 | seen[j] = true; |
128 | 0 | } |
129 | 0 | return true; |
130 | 0 | } |
131 | | |
132 | 0 | case SWS_OP_INVALID: |
133 | 0 | case SWS_OP_READ: |
134 | 0 | case SWS_OP_WRITE: |
135 | 0 | case SWS_OP_SWIZZLE: |
136 | 0 | case SWS_OP_CLEAR: |
137 | 0 | case SWS_OP_LINEAR: |
138 | 0 | case SWS_OP_PACK: |
139 | 0 | case SWS_OP_UNPACK: |
140 | 0 | return false; |
141 | 0 | case SWS_OP_TYPE_NB: |
142 | 0 | break; |
143 | 0 | } |
144 | | |
145 | 0 | av_unreachable("Invalid operation type!"); |
146 | 0 | return false; |
147 | 0 | } |
148 | | |
149 | | /* returns log2(x) only if x is a power of two, or 0 otherwise */ |
150 | | static int exact_log2(const int x) |
151 | 0 | { |
152 | 0 | int p; |
153 | 0 | if (x <= 0) |
154 | 0 | return 0; |
155 | 0 | p = av_log2(x); |
156 | 0 | return (1 << p) == x ? p : 0; |
157 | 0 | } |
158 | | |
159 | | static int exact_log2_q(const AVRational x) |
160 | 0 | { |
161 | 0 | if (x.den == 1) |
162 | 0 | return exact_log2(x.num); |
163 | 0 | else if (x.num == 1) |
164 | 0 | return -exact_log2(x.den); |
165 | 0 | else |
166 | 0 | return 0; |
167 | 0 | } |
168 | | |
169 | | /** |
170 | | * If a linear operation can be reduced to a scalar multiplication, returns |
171 | | * the corresponding scaling factor, or 0 otherwise. |
172 | | */ |
173 | | static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next, |
174 | | SwsConst *out_scale) |
175 | 0 | { |
176 | 0 | SwsConst scale = {0}; |
177 | | |
178 | | /* There are components not on the main diagonal */ |
179 | 0 | if (c->mask & ~SWS_MASK_DIAG4) |
180 | 0 | return false; |
181 | | |
182 | 0 | for (int i = 0; i < 4; i++) { |
183 | 0 | const AVRational s = c->m[i][i]; |
184 | 0 | if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i]) |
185 | 0 | continue; |
186 | 0 | if (scale.q.den && av_cmp_q(s, scale.q)) |
187 | 0 | return false; |
188 | 0 | scale.q = s; |
189 | 0 | } |
190 | | |
191 | 0 | if (scale.q.den) |
192 | 0 | *out_scale = scale; |
193 | 0 | return scale.q.den; |
194 | 0 | } |
195 | | |
196 | | /* Extracts an integer clear operation (subset) from the given linear op. */ |
197 | | static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev, |
198 | | SwsConst *out_clear) |
199 | 0 | { |
200 | 0 | SwsConst clear = {0}; |
201 | 0 | bool ret = false; |
202 | |
|
203 | 0 | for (int i = 0; i < 4; i++) { |
204 | 0 | bool const_row = c->m[i][4].den == 1; /* offset is integer */ |
205 | 0 | for (int j = 0; j < 4; j++) { |
206 | 0 | const_row &= c->m[i][j].num == 0 || /* scalar is zero */ |
207 | 0 | (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */ |
208 | 0 | } |
209 | 0 | if (const_row && (c->mask & SWS_MASK_ROW(i))) { |
210 | 0 | clear.q4[i] = c->m[i][4]; |
211 | 0 | for (int j = 0; j < 5; j++) |
212 | 0 | c->m[i][j] = Q(i == j); |
213 | 0 | c->mask &= ~SWS_MASK_ROW(i); |
214 | 0 | ret = true; |
215 | 0 | } |
216 | 0 | } |
217 | |
|
218 | 0 | if (ret) |
219 | 0 | *out_clear = clear; |
220 | 0 | return ret; |
221 | 0 | } |
222 | | |
223 | | /* Unswizzle a linear operation by aligning single-input rows with |
224 | | * their corresponding diagonal */ |
225 | | static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz) |
226 | 0 | { |
227 | 0 | SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3); |
228 | 0 | SwsLinearOp c = *op; |
229 | |
|
230 | 0 | for (int i = 0; i < 4; i++) { |
231 | 0 | int idx = -1; |
232 | 0 | for (int j = 0; j < 4; j++) { |
233 | 0 | if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO)) |
234 | 0 | continue; |
235 | 0 | if (idx >= 0) |
236 | 0 | return false; /* multiple inputs */ |
237 | 0 | idx = j; |
238 | 0 | } |
239 | | |
240 | 0 | if (idx >= 0 && idx != i) { |
241 | | /* Move coefficient to the diagonal */ |
242 | 0 | c.m[i][i] = c.m[i][idx]; |
243 | 0 | c.m[i][idx] = Q(0); |
244 | 0 | swiz.in[i] = idx; |
245 | 0 | } |
246 | 0 | } |
247 | | |
248 | 0 | if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask) |
249 | 0 | return false; /* no swizzle was identified */ |
250 | | |
251 | 0 | c.mask = ff_sws_linear_mask(c); |
252 | 0 | *out_swiz = swiz; |
253 | 0 | *op = c; |
254 | 0 | return true; |
255 | 0 | } |
256 | | |
257 | | int ff_sws_op_list_optimize(SwsOpList *ops) |
258 | 0 | { |
259 | 0 | int ret; |
260 | |
|
261 | 0 | retry: |
262 | 0 | ff_sws_op_list_update_comps(ops); |
263 | | |
264 | | /* Apply all in-place optimizations (that do not re-order the list) */ |
265 | 0 | for (int n = 0; n < ops->num_ops; n++) { |
266 | 0 | SwsOp dummy = {0}; |
267 | 0 | SwsOp *op = &ops->ops[n]; |
268 | 0 | SwsOp *prev = n ? &ops->ops[n - 1] : &dummy; |
269 | 0 | SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy; |
270 | | |
271 | | /* common helper variable */ |
272 | 0 | bool noop = true; |
273 | |
|
274 | 0 | switch (op->op) { |
275 | 0 | case SWS_OP_READ: |
276 | | /* Optimized further into refcopy / memcpy */ |
277 | 0 | if (next->op == SWS_OP_WRITE && |
278 | 0 | next->rw.elems == op->rw.elems && |
279 | 0 | next->rw.packed == op->rw.packed && |
280 | 0 | next->rw.frac == op->rw.frac) |
281 | 0 | { |
282 | 0 | ff_sws_op_list_remove_at(ops, n, 2); |
283 | 0 | av_assert1(ops->num_ops == 0); |
284 | 0 | return 0; |
285 | 0 | } |
286 | | |
287 | | /* Skip reading extra unneeded components */ |
288 | 0 | if (!op->rw.packed) { |
289 | 0 | int needed = op->rw.elems; |
290 | 0 | while (needed > 0 && next->comps.unused[needed - 1]) |
291 | 0 | needed--; |
292 | 0 | if (op->rw.elems != needed) { |
293 | 0 | op->rw.elems = needed; |
294 | 0 | goto retry; |
295 | 0 | } |
296 | 0 | } |
297 | 0 | break; |
298 | | |
299 | 0 | case SWS_OP_SWAP_BYTES: |
300 | | /* Redundant (double) swap */ |
301 | 0 | if (next->op == SWS_OP_SWAP_BYTES) { |
302 | 0 | ff_sws_op_list_remove_at(ops, n, 2); |
303 | 0 | goto retry; |
304 | 0 | } |
305 | 0 | break; |
306 | | |
307 | 0 | case SWS_OP_UNPACK: |
308 | | /* Redundant unpack+pack */ |
309 | 0 | if (next->op == SWS_OP_PACK && next->type == op->type && |
310 | 0 | next->pack.pattern[0] == op->pack.pattern[0] && |
311 | 0 | next->pack.pattern[1] == op->pack.pattern[1] && |
312 | 0 | next->pack.pattern[2] == op->pack.pattern[2] && |
313 | 0 | next->pack.pattern[3] == op->pack.pattern[3]) |
314 | 0 | { |
315 | 0 | ff_sws_op_list_remove_at(ops, n, 2); |
316 | 0 | goto retry; |
317 | 0 | } |
318 | 0 | break; |
319 | | |
320 | 0 | case SWS_OP_LSHIFT: |
321 | 0 | case SWS_OP_RSHIFT: |
322 | | /* Two shifts in the same direction */ |
323 | 0 | if (next->op == op->op) { |
324 | 0 | op->c.u += next->c.u; |
325 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
326 | 0 | goto retry; |
327 | 0 | } |
328 | | |
329 | | /* No-op shift */ |
330 | 0 | if (!op->c.u) { |
331 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
332 | 0 | goto retry; |
333 | 0 | } |
334 | 0 | break; |
335 | | |
336 | 0 | case SWS_OP_CLEAR: |
337 | 0 | for (int i = 0; i < 4; i++) { |
338 | 0 | if (!op->c.q4[i].den) |
339 | 0 | continue; |
340 | | |
341 | 0 | if ((prev->comps.flags[i] & SWS_COMP_ZERO) && |
342 | 0 | !(prev->comps.flags[i] & SWS_COMP_GARBAGE) && |
343 | 0 | op->c.q4[i].num == 0) |
344 | 0 | { |
345 | | /* Redundant clear-to-zero of zero component */ |
346 | 0 | op->c.q4[i].den = 0; |
347 | 0 | } else if (next->comps.unused[i]) { |
348 | | /* Unnecessary clear of unused component */ |
349 | 0 | op->c.q4[i] = (AVRational) {0, 0}; |
350 | 0 | } else if (op->c.q4[i].den) { |
351 | 0 | noop = false; |
352 | 0 | } |
353 | 0 | } |
354 | |
|
355 | 0 | if (noop) { |
356 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
357 | 0 | goto retry; |
358 | 0 | } |
359 | | |
360 | | /* Transitive clear */ |
361 | 0 | if (next->op == SWS_OP_CLEAR) { |
362 | 0 | for (int i = 0; i < 4; i++) { |
363 | 0 | if (next->c.q4[i].den) |
364 | 0 | op->c.q4[i] = next->c.q4[i]; |
365 | 0 | } |
366 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
367 | 0 | goto retry; |
368 | 0 | } |
369 | 0 | break; |
370 | | |
371 | 0 | case SWS_OP_SWIZZLE: |
372 | 0 | for (int i = 0; i < 4; i++) { |
373 | 0 | if (next->comps.unused[i]) |
374 | 0 | continue; |
375 | 0 | if (op->swizzle.in[i] != i) |
376 | 0 | noop = false; |
377 | 0 | } |
378 | | |
379 | | /* Identity swizzle */ |
380 | 0 | if (noop) { |
381 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
382 | 0 | goto retry; |
383 | 0 | } |
384 | | |
385 | | /* Transitive swizzle */ |
386 | 0 | if (next->op == SWS_OP_SWIZZLE) { |
387 | 0 | const SwsSwizzleOp orig = op->swizzle; |
388 | 0 | for (int i = 0; i < 4; i++) |
389 | 0 | op->swizzle.in[i] = orig.in[next->swizzle.in[i]]; |
390 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
391 | 0 | goto retry; |
392 | 0 | } |
393 | 0 | break; |
394 | | |
395 | 0 | case SWS_OP_CONVERT: |
396 | | /* No-op conversion */ |
397 | 0 | if (op->type == op->convert.to) { |
398 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
399 | 0 | goto retry; |
400 | 0 | } |
401 | | |
402 | | /* Transitive conversion */ |
403 | 0 | if (next->op == SWS_OP_CONVERT && |
404 | 0 | op->convert.expand == next->convert.expand) |
405 | 0 | { |
406 | 0 | av_assert1(op->convert.to == next->type); |
407 | 0 | op->convert.to = next->convert.to; |
408 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
409 | 0 | goto retry; |
410 | 0 | } |
411 | | |
412 | | /* Conversion followed by integer expansion */ |
413 | 0 | if (next->op == SWS_OP_SCALE && !op->convert.expand && |
414 | 0 | !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to))) |
415 | 0 | { |
416 | 0 | op->convert.expand = true; |
417 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
418 | 0 | goto retry; |
419 | 0 | } |
420 | 0 | break; |
421 | | |
422 | 0 | case SWS_OP_MIN: |
423 | 0 | for (int i = 0; i < 4; i++) { |
424 | 0 | if (next->comps.unused[i] || !op->c.q4[i].den) |
425 | 0 | continue; |
426 | 0 | if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0) |
427 | 0 | noop = false; |
428 | 0 | } |
429 | |
|
430 | 0 | if (noop) { |
431 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
432 | 0 | goto retry; |
433 | 0 | } |
434 | 0 | break; |
435 | | |
436 | 0 | case SWS_OP_MAX: |
437 | 0 | for (int i = 0; i < 4; i++) { |
438 | 0 | if (next->comps.unused[i] || !op->c.q4[i].den) |
439 | 0 | continue; |
440 | 0 | if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0) |
441 | 0 | noop = false; |
442 | 0 | } |
443 | |
|
444 | 0 | if (noop) { |
445 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
446 | 0 | goto retry; |
447 | 0 | } |
448 | 0 | break; |
449 | | |
450 | 0 | case SWS_OP_DITHER: |
451 | 0 | for (int i = 0; i < 4; i++) { |
452 | 0 | noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) || |
453 | 0 | next->comps.unused[i]; |
454 | 0 | } |
455 | |
|
456 | 0 | if (noop) { |
457 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
458 | 0 | goto retry; |
459 | 0 | } |
460 | 0 | break; |
461 | | |
462 | 0 | case SWS_OP_LINEAR: { |
463 | 0 | SwsSwizzleOp swizzle; |
464 | 0 | SwsConst c; |
465 | | |
466 | | /* No-op (identity) linear operation */ |
467 | 0 | if (!op->lin.mask) { |
468 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
469 | 0 | goto retry; |
470 | 0 | } |
471 | | |
472 | 0 | if (next->op == SWS_OP_LINEAR) { |
473 | | /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */ |
474 | 0 | const SwsLinearOp m1 = op->lin; |
475 | 0 | const SwsLinearOp m2 = next->lin; |
476 | 0 | for (int i = 0; i < 4; i++) { |
477 | 0 | for (int j = 0; j < 5; j++) { |
478 | 0 | AVRational sum = Q(0); |
479 | 0 | for (int k = 0; k < 4; k++) |
480 | 0 | sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j])); |
481 | 0 | if (j == 4) /* m1.m[4][j] == 1 */ |
482 | 0 | sum = av_add_q(sum, m2.m[i][4]); |
483 | 0 | op->lin.m[i][j] = sum; |
484 | 0 | } |
485 | 0 | } |
486 | 0 | op->lin.mask = ff_sws_linear_mask(op->lin); |
487 | 0 | ff_sws_op_list_remove_at(ops, n + 1, 1); |
488 | 0 | goto retry; |
489 | 0 | } |
490 | | |
491 | | /* Optimize away zero columns */ |
492 | 0 | for (int j = 0; j < 4; j++) { |
493 | 0 | const uint32_t col = SWS_MASK_COL(j); |
494 | 0 | if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col)) |
495 | 0 | continue; |
496 | 0 | for (int i = 0; i < 4; i++) |
497 | 0 | op->lin.m[i][j] = Q(i == j); |
498 | 0 | op->lin.mask &= ~col; |
499 | 0 | goto retry; |
500 | 0 | } |
501 | | |
502 | | /* Optimize away unused rows */ |
503 | 0 | for (int i = 0; i < 4; i++) { |
504 | 0 | const uint32_t row = SWS_MASK_ROW(i); |
505 | 0 | if (!next->comps.unused[i] || !(op->lin.mask & row)) |
506 | 0 | continue; |
507 | 0 | for (int j = 0; j < 5; j++) |
508 | 0 | op->lin.m[i][j] = Q(i == j); |
509 | 0 | op->lin.mask &= ~row; |
510 | 0 | goto retry; |
511 | 0 | } |
512 | | |
513 | | /* Convert constant rows to explicit clear instruction */ |
514 | 0 | if (extract_constant_rows(&op->lin, prev->comps, &c)) { |
515 | 0 | RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) { |
516 | 0 | .op = SWS_OP_CLEAR, |
517 | 0 | .type = op->type, |
518 | 0 | .comps = op->comps, |
519 | 0 | .c = c, |
520 | 0 | })); |
521 | 0 | goto retry; |
522 | 0 | } |
523 | | |
524 | | /* Multiplication by scalar constant */ |
525 | 0 | if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) { |
526 | 0 | op->op = SWS_OP_SCALE; |
527 | 0 | op->c = c; |
528 | 0 | goto retry; |
529 | 0 | } |
530 | | |
531 | | /* Swizzle by fixed pattern */ |
532 | 0 | if (extract_swizzle(&op->lin, prev->comps, &swizzle)) { |
533 | 0 | RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) { |
534 | 0 | .op = SWS_OP_SWIZZLE, |
535 | 0 | .type = op->type, |
536 | 0 | .swizzle = swizzle, |
537 | 0 | })); |
538 | 0 | goto retry; |
539 | 0 | } |
540 | 0 | break; |
541 | 0 | } |
542 | | |
543 | 0 | case SWS_OP_SCALE: { |
544 | 0 | const int factor2 = exact_log2_q(op->c.q); |
545 | | |
546 | | /* No-op scaling */ |
547 | 0 | if (op->c.q.num == 1 && op->c.q.den == 1) { |
548 | 0 | ff_sws_op_list_remove_at(ops, n, 1); |
549 | 0 | goto retry; |
550 | 0 | } |
551 | | |
552 | | /* Scaling by exact power of two */ |
553 | 0 | if (factor2 && ff_sws_pixel_type_is_int(op->type)) { |
554 | 0 | op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT; |
555 | 0 | op->c.u = FFABS(factor2); |
556 | 0 | goto retry; |
557 | 0 | } |
558 | 0 | break; |
559 | 0 | } |
560 | 0 | } |
561 | 0 | } |
562 | | |
563 | | /* Push clears to the back to void any unused components */ |
564 | 0 | for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */ |
565 | 0 | SwsOp *op = &ops->ops[n]; |
566 | 0 | SwsOp *next = &ops->ops[n + 1]; |
567 | |
|
568 | 0 | switch (op->op) { |
569 | 0 | case SWS_OP_CLEAR: |
570 | 0 | if (op_commute_clear(op, next)) { |
571 | 0 | FFSWAP(SwsOp, *op, *next); |
572 | 0 | goto retry; |
573 | 0 | } |
574 | 0 | break; |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | /* Apply any remaining preferential re-ordering optimizations; do these |
579 | | * last because they are more likely to block other optimizations if done |
580 | | * too aggressively */ |
581 | 0 | for (int n = 1; n < ops->num_ops - 1; n++) { /* exclude READ/WRITE */ |
582 | 0 | SwsOp *op = &ops->ops[n]; |
583 | 0 | SwsOp *prev = &ops->ops[n - 1]; |
584 | 0 | SwsOp *next = &ops->ops[n + 1]; |
585 | |
|
586 | 0 | switch (op->op) { |
587 | 0 | case SWS_OP_SWIZZLE: { |
588 | 0 | bool seen[4] = {0}; |
589 | 0 | bool has_duplicates = false; |
590 | 0 | for (int i = 0; i < 4; i++) { |
591 | 0 | if (next->comps.unused[i]) |
592 | 0 | continue; |
593 | 0 | has_duplicates |= seen[op->swizzle.in[i]]; |
594 | 0 | seen[op->swizzle.in[i]] = true; |
595 | 0 | } |
596 | | |
597 | | /* Try to push swizzles with duplicates towards the output */ |
598 | 0 | if (has_duplicates && op_commute_swizzle(op, next)) { |
599 | 0 | FFSWAP(SwsOp, *op, *next); |
600 | 0 | goto retry; |
601 | 0 | } |
602 | | |
603 | | /* Move swizzle out of the way between two converts so that |
604 | | * they may be merged */ |
605 | 0 | if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) { |
606 | 0 | op->type = next->convert.to; |
607 | 0 | FFSWAP(SwsOp, *op, *next); |
608 | 0 | goto retry; |
609 | 0 | } |
610 | 0 | break; |
611 | 0 | } |
612 | | |
613 | 0 | case SWS_OP_SCALE: |
614 | | /* Scaling by integer before conversion to int */ |
615 | 0 | if (op->c.q.den == 1 && next->op == SWS_OP_CONVERT && |
616 | 0 | ff_sws_pixel_type_is_int(next->convert.to)) |
617 | 0 | { |
618 | 0 | op->type = next->convert.to; |
619 | 0 | FFSWAP(SwsOp, *op, *next); |
620 | 0 | goto retry; |
621 | 0 | } |
622 | 0 | break; |
623 | 0 | } |
624 | 0 | } |
625 | | |
626 | 0 | return 0; |
627 | 0 | } |
628 | | |
629 | | int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[], |
630 | | int size, uint8_t clear_val, |
631 | | int *read_bytes, int *write_bytes) |
632 | 0 | { |
633 | 0 | const SwsOp read = ops->ops[0]; |
634 | 0 | const int read_size = ff_sws_pixel_type_size(read.type); |
635 | 0 | uint32_t mask[4] = {0}; |
636 | |
|
637 | 0 | if (!ops->num_ops || read.op != SWS_OP_READ) |
638 | 0 | return AVERROR(EINVAL); |
639 | 0 | if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1)) |
640 | 0 | return AVERROR(ENOTSUP); |
641 | | |
642 | 0 | for (int i = 0; i < read.rw.elems; i++) |
643 | 0 | mask[i] = 0x01010101 * i * read_size + 0x03020100; |
644 | |
|
645 | 0 | for (int opidx = 1; opidx < ops->num_ops; opidx++) { |
646 | 0 | const SwsOp *op = &ops->ops[opidx]; |
647 | 0 | switch (op->op) { |
648 | 0 | case SWS_OP_SWIZZLE: { |
649 | 0 | uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] }; |
650 | 0 | for (int i = 0; i < 4; i++) |
651 | 0 | mask[i] = orig[op->swizzle.in[i]]; |
652 | 0 | break; |
653 | 0 | } |
654 | | |
655 | 0 | case SWS_OP_SWAP_BYTES: |
656 | 0 | for (int i = 0; i < 4; i++) { |
657 | 0 | switch (ff_sws_pixel_type_size(op->type)) { |
658 | 0 | case 2: mask[i] = av_bswap16(mask[i]); break; |
659 | 0 | case 4: mask[i] = av_bswap32(mask[i]); break; |
660 | 0 | } |
661 | 0 | } |
662 | 0 | break; |
663 | | |
664 | 0 | case SWS_OP_CLEAR: |
665 | 0 | for (int i = 0; i < 4; i++) { |
666 | 0 | if (!op->c.q4[i].den) |
667 | 0 | continue; |
668 | 0 | if (op->c.q4[i].num != 0 || !clear_val) |
669 | 0 | return AVERROR(ENOTSUP); |
670 | 0 | mask[i] = 0x1010101ul * clear_val; |
671 | 0 | } |
672 | 0 | break; |
673 | | |
674 | 0 | case SWS_OP_CONVERT: { |
675 | 0 | if (!op->convert.expand) |
676 | 0 | return AVERROR(ENOTSUP); |
677 | 0 | for (int i = 0; i < 4; i++) { |
678 | 0 | switch (ff_sws_pixel_type_size(op->type)) { |
679 | 0 | case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF); break; |
680 | 0 | case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break; |
681 | 0 | } |
682 | 0 | } |
683 | 0 | break; |
684 | 0 | } |
685 | | |
686 | 0 | case SWS_OP_WRITE: { |
687 | 0 | if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1)) |
688 | 0 | return AVERROR(ENOTSUP); |
689 | | |
690 | | /* Initialize to no-op */ |
691 | 0 | memset(shuffle, clear_val, size); |
692 | |
|
693 | 0 | const int write_size = ff_sws_pixel_type_size(op->type); |
694 | 0 | const int read_chunk = read.rw.elems * read_size; |
695 | 0 | const int write_chunk = op->rw.elems * write_size; |
696 | 0 | const int num_groups = size / FFMAX(read_chunk, write_chunk); |
697 | 0 | for (int n = 0; n < num_groups; n++) { |
698 | 0 | const int base_in = n * read_chunk; |
699 | 0 | const int base_out = n * write_chunk; |
700 | 0 | for (int i = 0; i < op->rw.elems; i++) { |
701 | 0 | const int offset = base_out + i * write_size; |
702 | 0 | for (int b = 0; b < write_size; b++) { |
703 | 0 | const uint8_t idx = mask[i] >> (b * 8); |
704 | 0 | if (idx != clear_val) |
705 | 0 | shuffle[offset + b] = base_in + idx; |
706 | 0 | } |
707 | 0 | } |
708 | 0 | } |
709 | |
|
710 | 0 | *read_bytes = num_groups * read_chunk; |
711 | 0 | *write_bytes = num_groups * write_chunk; |
712 | 0 | return num_groups; |
713 | 0 | } |
714 | | |
715 | 0 | default: |
716 | 0 | return AVERROR(ENOTSUP); |
717 | 0 | } |
718 | 0 | } |
719 | | |
720 | 0 | return AVERROR(EINVAL); |
721 | 0 | } |